diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..59f6988bc573e5d34b2226c41d8c31a156a6f8f8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,11 @@ +[submodule "3rdparty/composable_kernel"] + path = 3rdparty/composable_kernel + url = ../composable_kernel + branch = rel-5.7.1 + +[submodule "3rdparty/moe_c"] + path = 3rdparty/moe_c + url = ../Moe + branch = W8A8 + + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..ec5ac6db8a08ef90db82ff2048711230b3ebf548 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +Copyright © + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..0d8f6f5d5d3ce0f5c755fffda3522f49462941f7
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+graft aiter
+graft aiter_meta
\ No newline at end of file
diff --git a/README.md b/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..33dc7261005384784d0252eb4ede7ed9b7046c3e 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,34 @@
+## Installation
+Build for development:
+```
+git submodule update --init
+python setup.py develop
+```
+Build a wheel (`.whl`) package:
+```
+bash das_build.sh
+```
+
+If you forgot `--recursive` when cloning, run the following command after `cd aiter`:
+```
+git submodule sync && git submodule update --init --recursive
+```
+
+## Run operators supported by aiter
+
+There are a number of op tests; run one with, for example, `python3 op_tests/test_layernorm2d.py`. Supported ops:
+| **Ops** | **Description** |
+|-------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+|ELEMENT WISE | ops: + - * / |
+|SIGMOID | σ(x) = 1 / (1 + e^-x) |
+|ALLREDUCE | Reduce + Broadcast |
+|KVCACHE | W_K W_V |
+|MHA | Multi-Head Attention |
+|MLA | Multi-head Latent Attention with [KV-Cache layout](https://docs.flashinfer.ai/tutorials/kv_layout.html#page-table-layout) |
+|PA | Paged Attention |
+|FusedMoe | Mixture of Experts |
+|QUANT | BF16/FP16 -> FP8/INT4 |
+|RMSNORM | Root mean square normalization |
+|LAYERNORM | y = (x - μ) / sqrt(σ² + ϵ) |
+|ROPE | Rotary Position Embedding |
+|GEMM | D = αAB + βC |
diff --git a/aiter/__init__.py b/aiter/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdee5193125d9fd2f51708eade2d09fadb20feee
--- /dev/null
+++ b/aiter/__init__.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: MIT
+
+import torch
+import os
+import logging
+
+
+logger = logging.getLogger("aiter")
+
+
+def getLogger():
+    """Return the module-level "aiter" logger, attaching a console handler on first use.
+
+    The handler is only added while the logger has no handlers, so calling this
+    again does not duplicate output.  Setting the environment variable
+    ``AITER_LOG_MORE`` to a non-zero integer selects a verbose format
+    (timestamp, process, source location); otherwise only the logger name and
+    the message are printed.
+    """
+    global logger
+    if not logger.handlers:
+        # The logger itself accepts everything; the console handler below
+        # filters at INFO.
+        logger.setLevel(logging.DEBUG)
+
+        console_handler = logging.StreamHandler()
+        if int(os.environ.get("AITER_LOG_MORE", 0)):
+            formatter = logging.Formatter(
+                fmt="[%(name)s %(levelname)s] %(asctime)s.%(msecs)03d - %(processName)s:%(process)d - %(pathname)s:%(lineno)d - %(funcName)s\n%(message)s",
+                datefmt="%Y-%m-%d %H:%M:%S",
+            )
+        else:
+            formatter = logging.Formatter(
+                fmt="[%(name)s] %(message)s",
+            )
+        console_handler.setFormatter(formatter)
+        console_handler.setLevel(logging.INFO)
+
+        logger.addHandler(console_handler)
+        # NOTE(review): presumably tells TorchDynamo to skip these logging
+        # calls while tracing; only set when the installed torch exposes the
+        # option -- confirm against the torch version in use.
+        if hasattr(torch._dynamo.config, "ignore_logger_methods"):
+            torch._dynamo.config.ignore_logger_methods = (
+                logging.Logger.info,
+                logging.Logger.warning,
+                logging.Logger.debug,
+                logger.warning,
+                logger.info,
+                logger.debug,
+            )
+
+    return logger
+
+
+logger = getLogger()
+
+import importlib.util
+
+# Pull in the optional "aiter_" module only when it can be found.
+if importlib.util.find_spec("aiter_") is not None:
+    from aiter_ import *
+# Order matters below: a later wildcard import overwrites same-named symbols
+# exported by an earlier one.
+from .jit import core
+# from .ops.enum import *
+from .ops.norm import *
+from .ops.quant import *
+# from .ops.gemm_op_a8w8 import *
+# from .ops.batched_gemm_op_a8w8 import *
+# from .ops.batched_gemm_op_bf16 import *
+from .ops.aiter_operator import *
+from .ops.activation import *
+# from .ops.attention import *
+# from .ops.custom import *
+from .ops.custom_all_reduce import *
+from .ops.moe_op import *
+from .ops.moe_c_op import *
+from .ops.moe_sorting import *
+from .ops.pos_encoding import *
+# from .ops.cache import *
+from .ops.rmsnorm import *
+from .ops.awq_gemm_asm import *
+from .ops.awq_dq_asm import *
+# from .ops.communication import *
+from .ops.rope import *
+from .ops.topk import *
+# from .ops.mha import *
+from .ops.gradlib import *
+# from .ops.trans_ragged_layout import *
+# from . 
import mla
+from .utility import dtypes,fp4_utils
diff --git a/aiter/awq_gemm_asm.py b/aiter/awq_gemm_asm.py
new file mode 100644
index 0000000000000000000000000000000000000000..46c2b970df44f19e69cd154866940c33ab18afd5
--- /dev/null
+++ b/aiter/awq_gemm_asm.py
@@ -0,0 +1,156 @@
+import torch
+import torch.nn.functional as F
+import ctypes
+from typing import Optional
+import aiter
+from aiter import ActivationType, QuantType, dtypes
+from aiter.ops.awq_gemm_asm import *
+from aiter.ops.shuffle import reverse_awq_order
+from aiter.ops.awq_gemm_asm import awq_gemm_asm
+from aiter.ops.awq_dq_asm import awq_dq_asm
+def pack_int4_to_int8(low_4bits):
+    """Pack consecutive pairs of 4-bit values into int8 bytes.
+
+    Element ``2*i`` goes into the low nibble and element ``2*i + 1`` into the
+    high nibble of output element ``i``.  An odd-length input is padded with
+    one trailing zero first.
+
+    Args:
+        low_4bits: integer tensor indexed along dim 0 (1-D when its length is
+            odd, because of the padding concat).  Values are not masked here;
+            the caller is expected to pass values that fit in 4 bits.
+
+    Returns:
+        ``torch.int8`` tensor with ``ceil(len(low_4bits) / 2)`` elements.
+    """
+    if len(low_4bits) % 2 != 0:
+        # Create the pad on the input's own device/dtype; a CPU uint8 pad
+        # makes torch.cat fail for inputs living on another device.
+        low_4bits = torch.cat([low_4bits, low_4bits.new_zeros(1)])
+
+    # Even indices -> low nibble, odd indices -> high nibble.
+    packed = low_4bits[::2] | (low_4bits[1::2] << 4)
+    # Reinterpret as signed int8.
+    return packed.to(torch.int8)
+def pack_int4_to_int8_64K(low_4bits):
+    """Pack 4-bit values that sit 64 elements apart into int8 bytes.
+
+    Output element ``i`` holds input element ``128*i`` in its low nibble and
+    input element ``128*i + 64`` in its high nibble.  An odd-length input is
+    padded with one trailing zero first.
+
+    NOTE(review): the two strided slices must have broadcast-compatible
+    lengths for the OR to succeed -- confirm the expected input length
+    (presumably a multiple of 128) with the caller.
+
+    Args:
+        low_4bits: integer tensor indexed along dim 0; values are not masked.
+
+    Returns:
+        ``torch.int8`` tensor of the packed values.
+    """
+    if len(low_4bits) % 2 != 0:
+        # Same device/dtype-safe padding as in pack_int4_to_int8.
+        low_4bits = torch.cat([low_4bits, low_4bits.new_zeros(1)])
+
+    # Index 128*i -> low nibble, index 128*i + 64 -> high nibble.
+    packed = low_4bits[::128] | (low_4bits[64::128] << 4)
+    # Reinterpret as signed int8.
+    return packed.to(torch.int8)
+
+
+# qweight - [K, N // 8]
+# qzeros - [K // G, N // 8]
+# scales - [K // G, N]
+def asm_awq_reorder_and_repack(
+    qweight: torch.Tensor,
+    qzeros: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Unpack AWQ weights/zero-points, reorder them, and repack two nibbles per int8.
+
+    Every element of ``qweight`` / ``qzeros`` is split into eight 4-bit values
+    (shifts 0, 4, ..., 28), passed through ``reverse_awq_order``, masked to
+    4 bits, and then adjacent column pairs are packed into one int8: column
+    ``2*j`` in the low nibble, column ``2*j + 1`` in the high nibble.
+
+    Args:
+        qweight: packed weights of shape ``[K, N // 8]``.
+        qzeros: packed zero points of shape ``[K // G, N // 8]``.
+
+    Returns:
+        ``(packed_weights, packed_zeros)`` as int8 tensors of shape
+        ``[K, N // 2]`` and ``[K // G, N // 2]``.
+
+    Raises:
+        AssertionError: if the group size ``K // qzeros.shape[0]`` is not 64.
+    """
+    #WARNING: Only support awq group_size=64
+    N = qweight.shape[1] * 8
+    K = qweight.shape[0]
+    G = K // qzeros.shape[0]
+    assert G == 64, "[ERROR] ASM_AWQ_GEMM not support K Groupsize other than 64!"
+    # assert (N % 512==0 or N==576), "[ERROR]ASM_AWQ_GEMM Not support Weight N other than 576 or multiplies of 512!"
+
+    device = qzeros.device
+    bits = 4
+    # One shift per nibble of a 32-bit element.
+    shifts = torch.arange(0, 32, bits, device=device)
+
+    iweights = torch.bitwise_right_shift(
+        qweight[:, :, None],
+        shifts[None, None, :],
+    ).to(torch.int8)
+    iweights = iweights.view(iweights.shape[0], -1)
+
+    zeros = torch.bitwise_right_shift(
+        qzeros[:, :, None],
+        shifts[None, None, :],
+    ).to(torch.int8)
+    zeros = zeros.view(qzeros.shape[0], -1)
+
+    # NOTE(review): reverse_awq_order is defined in aiter.ops.shuffle;
+    # presumably it undoes AWQ's interleaved nibble order -- confirm there.
+    zeros = reverse_awq_order(zeros)
+    iweights = reverse_awq_order(iweights)
+
+    # Keep only the low 4 bits of every unpacked value.
+    iweights = torch.bitwise_and(iweights, (2**bits) - 1)
+    zeros = torch.bitwise_and(zeros, (2**bits) - 1)
+
+    # Group adjacent columns in pairs: [rows, N] -> [rows, N // 2, 2].
+    iweights_packed = iweights.view(K, -1, 2)
+    zeros_packed = zeros.view(K//G, -1, 2)
+
+    # Repack each column pair into one int8 (pair member i occupies bits
+    # [4*i, 4*i + 3]); the [K, N] orientation is kept, only N is halved.
+    packed_weights = torch.zeros([K, N//2], dtype=torch.int8, device=qweight.device)
+    packed_zeros = torch.zeros([K//G, N//2], dtype=torch.int8, device=zeros.device)
+    for i in range(2):
+        packed_weights |= (iweights_packed[:, :, i].to(torch.int8) << (i * bits))
+        packed_zeros |= (zeros_packed[:, :, i].to(torch.int8) << (i * bits))
+
+    return packed_weights,packed_zeros
+
+def asm_awq_post_dequant_torch(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    qzeros: torch.Tensor,
+    group_size: int,
+) -> torch.Tensor:
+    """Dequantize weights using PyTorch implementation.
+
+    Each element of ``qweight`` / ``qzeros`` is split into two 4-bit values
+    (shifts 0 and 4); the result is ``(weights - zeros) * scales`` with
+    zeros and scales repeated ``group_size`` times along dim 0.
+
+    Args:
+        qweight: Quantized weight tensor
+        scales: Scale factors tensor
+        qzeros: Zero points tensor
+        group_size: Size of groups for quantization; ``-1`` means a single
+            group spanning ``qweight.shape[0]`` rows
+
+    Returns:
+        Dequantized tensor
+    """
+    if group_size == -1:
+        group_size = qweight.shape[0]
+
+    bits = 4
+    shifts = torch.arange(0, 8, bits, device=qzeros.device)
+    # Only 8 bits (two nibbles) per packed element need to be unpacked.
+    iweights = torch.bitwise_right_shift(
+        qweight[:, :, None],
+        shifts[None, None, :],
+    ).to(torch.int8)
+    iweights = iweights.view(iweights.shape[0], -1)
+    zeros = torch.bitwise_right_shift(
+        qzeros[:, :, None],
+        shifts[None, None, :],
+    ).to(torch.int8)
+    zeros = zeros.view(qzeros.shape[0], -1)
+
+    iweights = torch.bitwise_and(iweights, (2**bits) - 1)
+    zeros = torch.bitwise_and(zeros, (2**bits) - 1)
+
+    # Expand per-group scales/zeros to one row per weight row.
+    scales = scales.repeat_interleave(group_size, dim=0)
+    zeros = zeros.repeat_interleave(group_size, dim=0)
+    return (iweights - zeros) * scales
+
+def asm_awq_post_dequant(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    qzeros: torch.Tensor,
+    group_size: int,
+) -> torch.Tensor:
+    """Dequantize packed AWQ weights with the ``awq_dq_asm`` kernel.
+
+    Args:
+        qweight: packed weights, passed to the kernel unchanged.
+        scales: scale tensor; its dtype becomes the output dtype and its
+            shape ``[K // group_size, N]`` determines the output shape.
+        qzeros: packed zero points, passed to the kernel unchanged.
+        group_size: number of rows sharing one scale row.
+
+    Returns:
+        Tensor of shape ``[K, N]`` allocated on ``qweight.device`` and filled
+        by ``awq_dq_asm``.
+    """
+    K = scales.shape[0] * group_size
+    N = scales.shape[-1]
+    out = torch.empty((K, N), dtype=scales.dtype, device=qweight.device)
+    awq_dq_asm(out, qweight, qzeros, scales)
+    return out
+
+# The inference function
+# input - [m, k]
+# qweight - [n, k // 2]
+# qzeros - [k//g, n//2]
+# scales - [k//g, n]
+def asm_awq_gemm_a16w4(input: torch.Tensor,
+                       qweight: torch.Tensor,
+                       scales: torch.Tensor,
+                       qzeros: torch.Tensor) -> torch.Tensor:
+    """Run the ``awq_gemm_asm`` kernel on 16-bit activations and 4-bit AWQ weights.
+
+    Args:
+        input: 2-D activations ``[M, K]``; ``K`` must be a multiple of 256.
+        qweight: packed weights, passed to the kernel unchanged.
+        scales: scale tensor whose second dimension is ``N``.
+        qzeros: packed zero points; the output is allocated on its device.
+
+    Returns:
+        Tensor of shape ``[M, N]`` with ``input``'s dtype.
+
+    Raises:
+        AssertionError: if ``K`` is not a multiple of 256.
+    """
+    M,K = input.shape
+    N = scales.shape[1]
+    assert K % 256 == 0, "asm_awq_gemm_a16w4 requires K to be a multiple of 256"
+    device = qzeros.device
+    out_asm = torch.empty((M, N),
+                          dtype=input.dtype,
+                          device=device)
+    awq_gemm_asm(out_asm, qweight, input, qzeros, scales)
+    return out_asm
\ No newline at end of file
diff --git a/aiter/bert_padding.py b/aiter/bert_padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..3749260cd43432afff4fde0b9405a7eaa03e045b
--- /dev/null
+++ 
b/aiter/bert_padding.py
@@ -0,0 +1,227 @@
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+
+
+class IndexFirstAxis(torch.autograd.Function):
+    """Autograd function selecting rows ``input[indices]`` via gather (forward) and scatter (backward)."""
+
+    @staticmethod
+    def forward(ctx, input, indices):
+        ctx.save_for_backward(indices)
+        assert input.ndim >= 2
+        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
+        second_dim = other_shape.numel()
+        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
+        # return input[indices]
+        return torch.gather(
+            rearrange(input, "b ... -> b (...)"),
+            0,
+            repeat(indices, "z -> z d", d=second_dim),
+        ).reshape(-1, *other_shape)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (indices,) = ctx.saved_tensors
+        assert grad_output.ndim >= 2
+        other_shape = grad_output.shape[1:]
+        grad_output = rearrange(grad_output, "b ... -> b (...)")
+        grad_input = torch.zeros(
+            [ctx.first_axis_dim, grad_output.shape[1]],
+            device=grad_output.device,
+            dtype=grad_output.dtype,
+        )
+        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
+        # grad_input[indices] = grad_output
+        grad_input.scatter_(
+            0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output
+        )
+        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
+
+
+index_first_axis = IndexFirstAxis.apply
+
+
+class IndexPutFirstAxis(torch.autograd.Function):
+    """Autograd function writing ``values`` into rows ``indices`` of a zero tensor with ``first_axis_dim`` rows."""
+
+    @staticmethod
+    def forward(ctx, values, indices, first_axis_dim):
+        ctx.save_for_backward(indices)
+        assert indices.ndim == 1
+        assert values.ndim >= 2
+        output = torch.zeros(
+            first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype
+        )
+        # Plain index assignment is what runs here; the scatter_ form is kept
+        # below only as a reference.
+        output[indices] = values
+        # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (indices,) = ctx.saved_tensors
+        # Plain indexing is what runs here; the gather form is kept below
+        # only as a reference.
+        grad_values = grad_output[indices]
+        # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1]))
+        return grad_values, None, None
+
+
+index_put_first_axis = IndexPutFirstAxis.apply
+
+
+class IndexFirstAxisResidual(torch.autograd.Function):
+    """Like ``IndexFirstAxis`` but also returns the detached input as a second ("residual") output."""
+
+    @staticmethod
+    def forward(ctx, input, indices):
+        ctx.save_for_backward(indices)
+        assert input.ndim >= 2
+        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
+        # Plain indexing is used here (no gather), so no flattening is needed.
+        output = input[indices]
+        # We don't want to reshape input (b ... -> b (...)) since it could change the channel_last
+        # memory format to channel_first. In other words, input might not be contiguous.
+        # If we don't detach, Pytorch complains about output being a view and is being modified inplace
+        return output, input.detach()
+
+    @staticmethod
+    def backward(ctx, grad_output, grad_residual):
+        (indices,) = ctx.saved_tensors
+        assert grad_output.ndim >= 2
+        other_shape = grad_output.shape[1:]
+        assert grad_residual.shape[1:] == other_shape
+        # grad_input aliases grad_residual, so the scatter_add_ below
+        # accumulates into grad_residual in place.
+        grad_input = grad_residual
+        # grad_input[indices] += grad_output
+        indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1)))
+        indices = indices.expand_as(grad_output)
+        grad_input.scatter_add_(0, indices, grad_output)
+        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
+
+
+index_first_axis_residual = IndexFirstAxisResidual.apply
+
+
+def unpad_input(hidden_states, attention_mask, unused_mask=None):
+    """
+    Arguments:
+        hidden_states: (batch, seqlen, ...)
+        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
+        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
+    Return:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
+        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
+        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
+        max_seqlen_in_batch: int
+        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
+    """
+    all_masks = (
+        (attention_mask + unused_mask) if unused_mask is not None else attention_mask
+    )
+    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
+    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
+    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
+    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
+    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
+    # so we write custom forward and backward to make it a bit faster.
+    return (
+        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+        used_seqlens_in_batch,
+    )
+
+
+def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length):
+    """
+    Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model).
+    The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).
+
+    For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
+        ```
+        [
+          [2, 3, 0, 0, 0, 0],
+          [3, 2, 0, 0, 0, 0],
+          [6, 0, 0, 0, 0, 0]
+        ]
+        ```
+    , which refers to the 3D-attention mask:
+        ```
+        [
+          [
+            [1, 0, 0, 0, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [0, 0, 1, 0, 0, 0],
+            [0, 0, 1, 1, 0, 0],
+            [0, 0, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0, 1]
+          ],
+          [
+            [1, 0, 0, 0, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [1, 1, 1, 0, 0, 0],
+            [0, 0, 0, 1, 0, 0],
+            [0, 0, 0, 1, 1, 0],
+            [0, 0, 0, 0, 0, 1]
+          ],
+          [
+            [1, 0, 0, 0, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [1, 1, 1, 0, 0, 0],
+            [1, 1, 1, 1, 0, 0],
+            [1, 1, 1, 1, 1, 0],
+            [1, 1, 1, 1, 1, 1]
+          ]
+        ]
+        ```.
+
+    Arguments:
+        hidden_states: (batch, seqlen, ...)
+        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none.
+    Return:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
+        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
+        max_seqlen_in_batch: int
+    """
+    length = attention_mask_in_length.sum(dim=-1)
+    seqlen = attention_mask_in_length.size(-1)
+    attention_mask_2d = torch.arange(
+        seqlen, device=length.device, dtype=length.dtype
+    ).expand(len(length), seqlen) < length.unsqueeze(1)
+    real_indices_idx = torch.nonzero(
+        attention_mask_in_length.flatten(), as_tuple=False
+    ).flatten()
+    seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
+    indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
+    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
+    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
+    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
+    # so we write custom forward and backward to make it a bit faster.
+    return (
+        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+def pad_input(hidden_states, indices, batch, seqlen):
+    """
+    Arguments:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
+        batch: int, batch size for the padded sequence.
+        seqlen: int, maximum sequence length for the padded sequence.
+    Return:
+        hidden_states: (batch, seqlen, ...)
+    """
+    # Scatter the packed rows into a zero tensor with batch * seqlen rows,
+    # then split the first axis back into (batch, seqlen).
+    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
+    return rearrange(output, "(b s) ... 
-> b s ...", b=batch)
diff --git a/aiter/blaslt_scale_mm.py b/aiter/blaslt_scale_mm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cc70ad0b99341feea1fd013195c8d34d116f8d3
--- /dev/null
+++ b/aiter/blaslt_scale_mm.py
@@ -0,0 +1,38 @@
+import os
+from pathlib import Path
+import functools
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from aiter import hipb_create_extension, hipb_mm, getHipblasltKernelName
+from aiter import rocb_create_extension, rocb_mm
+from aiter import logger, dtypes
+from aiter.jit.utils.torch_guard import torch_compile_guard
+from typing import Optional
+
+# Set once hipb_create_extension() has been called by scale_mm.
+extensions_created = False
+@torch_compile_guard()
+def scale_mm(
+    inp: torch.Tensor,
+    weights: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    otype: Optional[torch.dtype] = None,
+    scale_a: Optional[torch.Tensor] = None,
+    scale_b: Optional[torch.Tensor] = None,
+    scale_c: Optional[torch.Tensor] = None,
+    scale_type: Optional[int] = None,
+)-> torch.Tensor:
+    """Multiply ``inp`` by ``weights.t()`` through ``hipb_mm`` with optional scales.
+
+    Args:
+        inp: input tensor with fewer than 3 dimensions.
+        weights: weight tensor; it is transposed before being passed on.
+        bias: optional bias forwarded to ``hipb_mm``.
+        otype: output dtype; defaults to ``inp.dtype``.
+        scale_a, scale_b, scale_c: optional scale tensors forwarded as-is.
+        scale_type: 0 = scalar scale, 1 = channel scale, 2 = block scale.
+
+    Returns:
+        The tensor returned by ``hipb_mm``.
+
+    Raises:
+        ValueError: if ``inp`` has 3 or more dimensions.
+    """
+    global extensions_created
+    # The original `assert(False, "...")` asserted a non-empty tuple, which is
+    # always true, so this guard never fired.  Raise explicitly so the check
+    # also survives `python -O`.
+    if inp.dim() >= 3:
+        raise ValueError("not support 3dim input")
+    if otype is None:
+        otype = inp.dtype
+    # Create the hipBLASLt extension lazily, on the first call only.
+    if not extensions_created:
+        hipb_create_extension()
+        extensions_created = True
+
+    # NOTE(review): the literal -1 is presumably the "no specific solution"
+    # index of hipb_mm -- confirm against its signature.
+    return hipb_mm(inp, weights.t(), -1, bias, otype, scale_a, scale_b, scale_c, scale_type)
\ No newline at end of file
diff --git a/aiter/configs/__init__.py b/aiter/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/aiter/configs/a8w8_blockscale_untuned_gemm.csv b/aiter/configs/a8w8_blockscale_untuned_gemm.csv
new file mode 100644
index 0000000000000000000000000000000000000000..8a8bcd71547dcf46bc8f7a972b64cef9e56f4098
--- /dev/null
+++ b/aiter/configs/a8w8_blockscale_untuned_gemm.csv
@@ -0,0 +1,234 @@
+M,N,K
+16, 1536, 7168
+
+16, 3072, 1536
+
+16, 576, 7168
+
+16, 7168, 256 + +16, 7168, 2048 + +16, 4608, 7168 + +16, 7168, 2304 + +16, 512, 7168 + +16, 4096, 512 + +32, 1536, 7168 + +32, 3072, 1536 + +32, 576, 7168 + +32, 7168, 256 + +32, 7168, 2048 + +32, 4608, 7168 + +32, 7168, 2304 + +32, 512, 7168 + +32, 4096, 512 + +64, 1536, 7168 + +64, 3072, 1536 + +64, 576, 7168 + +64, 7168, 256 + +64, 7168, 2048 + +64, 4608, 7168 + +64, 7168, 2304 + +64, 512, 7168 + +64, 4096, 512 + +128, 1536, 7168 + +128, 3072, 1536 + +128, 576, 7168 + +128, 7168, 256 + +128, 7168, 2048 + +128, 4608, 7168 + +128, 7168, 2304 + +128, 512, 7168 + +128, 4096, 512 + +256, 1536, 7168 + +256, 3072, 1536 + +256, 576, 7168 + +256, 7168, 256 + +256, 7168, 2048 + +256, 4608, 7168 + +256, 7168, 2304 + +256, 512, 7168 + +256, 4096, 512 + +512, 1536, 7168 + +512, 3072, 1536 + +512, 576, 7168 + +512, 7168, 256 + +512, 7168, 2048 + +512, 4608, 7168 + +512, 7168, 2304 + +512, 512, 7168 + +512, 4096, 512 + +1024, 1536, 7168 + +1024, 3072, 1536 + +1024, 576, 7168 + +1024, 7168, 256 + +1024, 7168, 2048 + +1024, 4608, 7168 + +1024, 7168, 2304 + +1024, 512, 7168 + +1024, 4096, 512 + +1536, 1536, 7168 + +1536, 3072, 1536 + +1536, 576, 7168 + +1536, 7168, 256 + +1536, 7168, 2048 + +1536, 4608, 7168 + +1536, 7168, 2304 + +1536, 512, 7168 + +1536, 4096, 512 + +2048, 1536, 7168 + +2048, 3072, 1536 + +2048, 576, 7168 + +2048, 7168, 256 + +2048, 7168, 2048 + +2048, 4608, 7168 + +2048, 7168, 2304 + +2048, 512, 7168 + +2048, 4096, 512 + +4096, 1536, 7168 + +4096, 3072, 1536 + +4096, 576, 7168 + +4096, 7168, 256 + +4096, 7168, 2048 + +4096, 4608, 7168 + +4096, 7168, 2304 + +4096, 512, 7168 + +4096, 4096, 512 + +8192, 1536, 7168 + +8192, 3072, 1536 + +8192, 576, 7168 + +8192, 7168, 256 + +8192, 7168, 2048 + +8192, 4608, 7168 + +8192, 7168, 2304 + +8192, 512, 7168 + +8192, 4096, 512 + +16384, 1536, 7168 + +16384, 3072, 1536 + +16384, 576, 7168 + +16384, 7168, 256 + +16384, 7168, 2048 + +16384, 4608, 7168 + +16384, 7168, 2304 + +16384, 512, 7168 + +16384, 4096, 512 + +20480, 
1536, 7168 + +20480, 3072, 1536 + +20480, 576, 7168 + +20480, 7168, 256 + +20480, 7168, 2048 + +20480, 4608, 7168 + +20480, 7168, 2304 + +20480, 512, 7168 + +20480, 4096, 512 \ No newline at end of file diff --git a/aiter/configs/a8w8_untuned_batched_gemm.csv b/aiter/configs/a8w8_untuned_batched_gemm.csv new file mode 100644 index 0000000000000000000000000000000000000000..849a7f7697041b866bc72a985a0cfb9e231cca3c --- /dev/null +++ b/aiter/configs/a8w8_untuned_batched_gemm.csv @@ -0,0 +1,27 @@ +B,M,N,K +16, 1, 1280, 8192 +16, 32, 1280, 8192 +16, 64, 1280, 8192 +16, 128, 1280, 8192 +16, 192, 1280, 8192 +16, 256, 1280, 8192 +16, 320, 1280, 8192 +16, 512, 1280, 8192 +16, 1024, 1280, 8192 +16, 2048, 1280, 8192 +16, 4096, 1280, 8192 +16, 8192, 1280, 8192 +16, 16384, 1280, 8192 +16, 1, 8192, 1024 +16, 32, 8192, 1024 +16, 64, 8192, 1024 +16, 128, 8192, 1024 +16, 192, 8192, 1024 +16, 256, 8192, 1024 +16, 320, 8192, 1024 +16, 512, 8192, 1024 +16, 1024, 8192, 1024 +16, 2048, 8192, 1024 +16, 4096, 8192, 1024 +16, 8192, 8192, 1024 +16, 16384, 8192, 1024 diff --git a/aiter/configs/a8w8_untuned_gemm.csv b/aiter/configs/a8w8_untuned_gemm.csv new file mode 100644 index 0000000000000000000000000000000000000000..05a50f320f75bc567e9731e37ef5ecb987a8054f --- /dev/null +++ b/aiter/configs/a8w8_untuned_gemm.csv @@ -0,0 +1,27 @@ +M,N,K +1, 1280, 8192 +32, 1280, 8192 +64, 1280, 8192 +128, 1280, 8192 +192, 1280, 8192 +256, 1280, 8192 +320, 1280, 8192 +512, 1280, 8192 +1024, 1280, 8192 +2048, 1280, 8192 +4096, 1280, 8192 +8192, 1280, 8192 +16384, 1280, 8192 +1, 8192, 1024 +32, 8192, 1024 +64, 8192, 1024 +128, 8192, 1024 +192, 8192, 1024 +256, 8192, 1024 +320, 8192, 1024 +512, 8192, 1024 +1024, 8192, 1024 +2048, 8192, 1024 +4096, 8192, 1024 +8192, 8192, 1024 +16384, 8192, 1024 diff --git a/aiter/configs/asm_a8w8_gemm.csv b/aiter/configs/asm_a8w8_gemm.csv new file mode 100644 index 0000000000000000000000000000000000000000..169f175eb4d11058c389306b5bebb77f810ed6db --- /dev/null +++ 
b/aiter/configs/asm_a8w8_gemm.csv @@ -0,0 +1,21 @@ +M,N,K,bias,outdtype,splitK,us +128,1280,8192,True,torch.bfloat16,3,13.85 +192,1280,8192,True,torch.bfloat16,3,13.90 +256,1280,8192,True,torch.bfloat16,3,25.54 +320,1280,8192,True,torch.bfloat16,3,25.56 +512,1280,8192,True,torch.bfloat16,3,48.06 +1024,1280,8192,True,torch.bfloat16,3,94.45 +2048,1280,8192,True,torch.bfloat16,3,186.90 +4096,1280,8192,True,torch.bfloat16,3,371.85 +8192,1280,8192,True,torch.bfloat16,3,742.90 +16384,1280,8192,True,torch.bfloat16,3,1483.54 +128,8192,1024,True,torch.bfloat16,0,12.67 +192,8192,1024,True,torch.bfloat16,0,12.70 +256,8192,1024,True,torch.bfloat16,0,23.80 +320,8192,1024,True,torch.bfloat16,0,23.82 +512,8192,1024,True,torch.bfloat16,0,44.61 +1024,8192,1024,True,torch.bfloat16,0,76.33 +2048,8192,1024,True,torch.bfloat16,0,140.05 +4096,8192,1024,True,torch.bfloat16,0,277.80 +8192,8192,1024,True,torch.bfloat16,0,552.69 +16384,8192,1024,True,torch.bfloat16,0,1095.12 \ No newline at end of file diff --git a/aiter/configs/asm_tune/awq_NN_solutions.json b/aiter/configs/asm_tune/awq_NN_solutions.json new file mode 100644 index 0000000000000000000000000000000000000000..28fef7417c6dec74e3cde29dc22d3be7c3fd0b43 --- /dev/null +++ b/aiter/configs/asm_tune/awq_NN_solutions.json @@ -0,0 +1,67 @@ +{ + "tunedCSV": "tuned_awq_gemm_NN.csv", + + "kernels": [ + { + "solutionId": 0, + "kernel_name": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 64, "mt1": 32, "numThreads": 512, "wgm": 1 } + }, + + { + "solutionId": 1, + "kernel_name": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 64, "mt1": 64, "numThreads": 512, "wgm": 1 } + }, + { + "solutionId": 2, + "kernel_name": 
"Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 64, "mt1": 128, "numThreads": 512, "wgm": 1 } + }, + + { + "solutionId": 3, + "kernel_name": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 16, "mt1": 32, "numThreads": 512, "wgm": 1 } + }, + + { + "solutionId": 4, + "kernel_name": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.co", + "Kconfigs": { "mt0": 64, "mt1": 32, "numThreads": 768, "wgm": 1 } + }, + { + "solutionId": 5, + "kernel_name": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.co", + "Kconfigs": { "mt0": 64, "mt1": 64, "numThreads": 768, "wgm": 1 } + }, + { + "solutionId": 6, + "kernel_name": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.co", + "Kconfigs": { "mt0": 64, "mt1": 128, "numThreads": 768, "wgm": 1 } + } + + + ], + + "Untunedkernels": [ + { + "solutionId": 4, + "kernel_name": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 32, "mt1": 32, "numThreads": 768, "wgm": 1 } + } + ] +} + + + + + diff --git a/aiter/configs/asm_tune/awq_bf16_NN_solutions.json b/aiter/configs/asm_tune/awq_bf16_NN_solutions.json new file mode 100644 index 0000000000000000000000000000000000000000..20e30aa32c104ba2483023ac19ddea31e03cd7e0 --- /dev/null +++ b/aiter/configs/asm_tune/awq_bf16_NN_solutions.json @@ 
-0,0 +1,67 @@ +{ + "tunedCSV": "tuned_awq_bf16_gemm_NN.csv", + + "kernels": [ + { + "solutionId": 0, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 64, "mt1": 32, "numThreads": 512, "wgm": 1 } + }, + + { + "solutionId": 1, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 64, "mt1": 64, "numThreads": 512, "wgm": 1 } + }, + { + "solutionId": 2, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 64, "mt1": 128, "numThreads": 512, "wgm": 1 } + }, + + { + "solutionId": 3, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 16, "mt1": 32, "numThreads": 512, "wgm": 1 } + }, + + { + "solutionId": 4, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.co", + "Kconfigs": { "mt0": 64, "mt1": 32, "numThreads": 768, "wgm": 1 } + }, + { + "solutionId": 5, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.co", + "Kconfigs": { "mt0": 64, "mt1": 64, "numThreads": 768, "wgm": 1 } + }, + { + "solutionId": 6, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.co", + "Kconfigs": { "mt0": 
64, "mt1": 128, "numThreads": 768, "wgm": 1 } + } + + + ], + + "Untunedkernels": [ + { + "solutionId": 4, + "kernel_name": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3", + "co_file": "Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.co", + "Kconfigs": { "mt0": 32, "mt1": 32, "numThreads": 768, "wgm": 1 } + } + ] +} + + + + + diff --git a/aiter/configs/asm_tune/cmd b/aiter/configs/asm_tune/cmd new file mode 100644 index 0000000000000000000000000000000000000000..f9e7bc2096b4548e48bd991db59da654521b35e5 --- /dev/null +++ b/aiter/configs/asm_tune/cmd @@ -0,0 +1,5 @@ +GPU_ARCHS=gfx936 python3 gradlib/gradlib/gemm_tuner.py \ +--tuned_file aiter/configs/asm_tune/tuned_awq_gemm_NN.csv \ +--inputSols_file aiter/configs/asm_tune/awq_NN_solutions.json \ +--input_file aiter/configs/asm_tune/untuned_awqgemm_NN.csv \ +--warmupIters 1 --runIters 3 --fastNoCheck 1 --hsacoOnly 1 \ No newline at end of file diff --git a/aiter/configs/asm_tune/tuned_awq_bf16_gemm_NN.csv b/aiter/configs/asm_tune/tuned_awq_bf16_gemm_NN.csv new file mode 100644 index 0000000000000000000000000000000000000000..0cd871ba05922b86c27fa5f83d9c52c0779538b5 --- /dev/null +++ b/aiter/configs/asm_tune/tuned_awq_bf16_gemm_NN.csv @@ -0,0 +1,4609 @@ +M,N,K,G,bias,dtype,outdtype,scaleAB,awqgemm,libtype,solidx,soltimes,kernelName +1,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+1,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+1,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+1,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,68.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+2,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+2,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,70.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+3,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+3,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+3,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+4,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+4,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+5,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+5,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+5,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+6,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+6,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+6,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+7,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+7,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+8,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+8,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+8,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+9,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+9,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+10,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+10,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+10,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+11,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+11,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+11,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+12,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+12,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+13,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+13,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+13,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+14,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+14,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+14,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+15,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+15,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+16,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+16,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+16,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+17,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+17,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+17,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+18,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+18,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+19,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+19,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+19,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,79.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+20,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+20,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+20,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+21,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+21,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+22,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+22,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+22,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+23,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+23,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+23,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,80.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+24,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+24,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+25,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+25,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+25,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+26,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+26,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+26,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+27,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+27,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+27,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+28,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+28,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,42.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+29,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+29,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+29,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,42.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+30,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+30,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,42.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+30,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,79.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+31,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+31,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,42.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+32,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+32,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,17.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+32,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,42.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,78.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+33,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+33,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,64.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+33,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+34,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+34,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,26.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,34.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,65.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+35,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+35,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+35,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,65.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+36,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+36,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,26.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,66.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+36,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+37,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+37,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,26.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,66.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+38,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+38,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,26.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+38,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,66.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+39,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+39,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+39,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,67.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+40,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+40,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,67.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+40,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+41,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,34.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+41,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,37.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,68.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+42,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+42,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+42,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,68.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+43,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+43,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,69.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+43,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+44,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+44,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,68.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+45,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+45,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+45,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,69.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+46,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+46,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,69.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+46,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+47,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+47,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,69.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+48,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+48,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,24.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,27.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+48,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,70.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,36.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +49,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+49,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+49,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,71.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+49,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+50,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+50,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,71.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+51,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+51,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+51,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,72.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+52,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+52,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+52,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,72.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+53,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+53,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,72.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+53,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+54,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+54,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,72.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+55,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+55,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+55,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,73.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+56,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+56,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+56,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+57,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+57,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,74.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+58,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+58,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+58,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,74.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+59,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+59,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+59,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+60,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+60,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+61,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+61,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+61,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+62,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+62,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+62,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+63,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+63,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+64,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+64,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+64,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+65,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +65,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+65,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+65,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +66,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+66,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+66,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,34.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+66,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +67,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+67,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+67,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+68,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +68,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+68,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+68,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+69,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +69,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+69,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,34.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+69,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,101.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +70,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+70,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+70,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+70,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +71,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+71,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,34.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+71,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+72,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +72,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+72,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+72,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +73,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+73,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+73,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+73,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +74,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+74,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,15.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,34.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+74,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+75,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +75,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+75,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+75,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,101.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+76,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +76,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,15.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+76,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+76,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,5.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,10.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +77,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+77,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+77,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+77,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +78,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+78,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+78,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+79,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +79,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+79,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+79,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+80,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +80,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+80,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+80,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +81,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+81,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+81,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+81,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,101.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +82,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+82,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+82,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+83,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +83,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+83,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+83,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+84,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +84,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+84,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+84,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,100.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +85,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+85,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+85,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+85,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +86,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+86,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+86,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+87,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +87,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+87,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+87,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 
+88,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+88,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+88,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +89,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+89,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+89,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+90,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +90,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+90,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+90,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+91,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +91,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+91,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+91,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +92,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+92,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+92,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+92,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +93,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+93,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+93,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+94,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +94,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+94,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+94,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,101.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+95,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +95,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+95,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+95,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +96,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+96,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+96,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+96,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +97,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+97,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+97,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+98,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +98,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+98,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+98,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+99,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +99,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+99,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+99,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +100,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+100,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+100,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,36.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+100,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +101,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+101,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+101,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+102,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +102,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+102,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+102,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+103,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +103,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+103,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+103,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 
+104,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+104,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,37.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+104,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +105,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+105,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+105,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+105,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +106,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,14.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+106,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+106,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+107,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +107,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+107,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+107,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+108,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +108,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+108,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+108,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +109,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+109,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+109,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+109,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,102.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +110,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+110,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+110,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,38.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+111,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +111,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+111,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+111,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+112,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +112,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+112,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+112,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,103.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +113,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+113,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+113,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+113,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +114,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+114,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+114,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+115,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +115,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+115,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+115,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+116,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +116,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+116,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+116,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,40.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 
+117,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+117,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+117,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,106.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +118,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+118,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+118,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+118,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,105.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +119,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+119,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,30.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+119,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,107.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+120,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +120,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+120,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+120,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+121,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +121,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+121,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+121,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +122,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+122,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+122,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+122,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +123,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+123,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,30.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+123,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,87.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,105.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+124,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +124,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+124,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+124,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,105.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+125,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +125,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+125,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+125,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,88.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,105.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +126,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+126,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+126,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+126,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,105.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +127,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+127,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,29.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+127,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.16,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.84,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,104.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+128,576,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,3072,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,4096,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,4608,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,7168,256,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,2,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +128,576,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,12.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,512,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,16.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,576,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+128,1536,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,1536,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,30.32,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,512,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.76,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,2048,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,36.4,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,512,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,16.56,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+128,3072,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,2304,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,40.72,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,512,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,3,41.12,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,83.68,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,85.04,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,5,86.24,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,7168,64,0,torch.bfloat16,torch.bfloat16,0.0,1.0,hsaco,6,106.64,Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 diff --git a/aiter/configs/asm_tune/tuned_awq_gemm_NN.csv b/aiter/configs/asm_tune/tuned_awq_gemm_NN.csv new file mode 100644 index 0000000000000000000000000000000000000000..5abcb730426b5edc978c203fb02dc5e340b696ed --- /dev/null +++ b/aiter/configs/asm_tune/tuned_awq_gemm_NN.csv @@ -0,0 +1,4609 @@ +M,N,K,G,bias,dtype,outdtype,scaleAB,awqgemm,libtype,solidx,soltimes,kernelName +1,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+1,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+1,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+1,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,68.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +1,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +1,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+2,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+2,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,70.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +2,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +2,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+2,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+3,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+3,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +3,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +3,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+4,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+4,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +4,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+4,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +4,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+5,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+5,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +5,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +5,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+6,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+6,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+6,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +6,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +6,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+7,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+7,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +7,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +7,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+8,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+8,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+8,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +8,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +8,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+9,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+9,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +9,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +9,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+9,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+10,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+10,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +10,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +10,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+11,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+11,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +11,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+11,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +11,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+12,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+12,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +12,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +12,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+13,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+13,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+13,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +13,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +13,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+14,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+14,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +14,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +14,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+15,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+15,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+15,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +15,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +15,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+16,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+16,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +16,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +16,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+16,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+17,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+17,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +17,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +17,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+18,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+18,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +18,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+18,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +18,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+19,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+19,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +19,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +19,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,79.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+20,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+20,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+20,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +20,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +20,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+21,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+21,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +21,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +21,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+22,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+22,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+22,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +22,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +22,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+23,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+23,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +23,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +23,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+23,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,80.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+24,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+24,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +24,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +24,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+25,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+25,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +25,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+25,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +25,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+26,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+26,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +26,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +26,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+27,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+27,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+27,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +27,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +27,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+28,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+28,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,42.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +28,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +28,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+29,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+29,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+29,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,42.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +29,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +29,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+30,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+30,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,42.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +30,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +30,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+30,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,79.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+31,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+31,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,42.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +31,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +31,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+32,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+32,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,17.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,42.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +32,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+32,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +32,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,78.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+33,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+33,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,64.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +33,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +33,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+34,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+34,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,26.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,34.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+34,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,65.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +34,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +34,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+35,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+35,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,65.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +35,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +35,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+36,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+36,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,26.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+36,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,66.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +36,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +36,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+37,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+37,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,26.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,66.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +37,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +37,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+37,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+38,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,26.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+38,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,66.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +38,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +38,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+39,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+39,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,67.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +39,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+39,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +39,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+40,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+40,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,67.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +40,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +40,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+41,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+41,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,34.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,37.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+41,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,68.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +41,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +41,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+42,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+42,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,68.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +42,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +42,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+43,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+43,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+43,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,69.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +43,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +43,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+44,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+44,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,68.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +44,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +44,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+44,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+45,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+45,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,69.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +45,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +45,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+46,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+46,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,69.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +46,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+46,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +46,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+47,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+47,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,69.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +47,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +47,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+48,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+48,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,24.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,27.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+48,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,70.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +48,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +48,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,36.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +49,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+49,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+49,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,71.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +49,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +49,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+50,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+50,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+50,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,71.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +50,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +50,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+51,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+51,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,72.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +51,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +51,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+51,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+52,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+52,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,72.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +52,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +52,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+53,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+53,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,72.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +53,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+53,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +53,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+54,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+54,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,72.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +54,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +54,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+55,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+55,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+55,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,73.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +55,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +55,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+56,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+56,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +56,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +56,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+57,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+57,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+57,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,74.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +57,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +57,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+58,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+58,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,74.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +58,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +58,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+58,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+59,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+59,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +59,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +59,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+60,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+60,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +60,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+60,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +60,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+61,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+61,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +61,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +61,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+62,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+62,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +62,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+62,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +62,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+63,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+63,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +63,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +63,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+64,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,9.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+64,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+64,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +64,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,77.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +64,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +65,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+65,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+65,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +65,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +65,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+65,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +66,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+66,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,34.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+66,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +66,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +66,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+67,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +67,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+67,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +67,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+67,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +67,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +68,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+68,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+68,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +68,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +68,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+69,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +69,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+69,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,34.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +69,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+69,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +69,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,101.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +70,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+70,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+70,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +70,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +70,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+71,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +71,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+71,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,34.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+71,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +71,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +71,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +72,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+72,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+72,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +72,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +72,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+72,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +73,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+73,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+73,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +73,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +73,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+74,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +74,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,15.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+74,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,34.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +74,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+74,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +74,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +75,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+75,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+75,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +75,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +75,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,101.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+76,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +76,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,15.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+76,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +76,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+76,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +76,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,5.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,10.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +77,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+77,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+77,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +77,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +77,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+78,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +78,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+78,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+78,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +78,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +78,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +79,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+79,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+79,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +79,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +79,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+79,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +80,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+80,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+80,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +80,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +80,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+81,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +81,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+81,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +81,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+81,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +81,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,101.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +82,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+82,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+82,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +82,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +82,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+83,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +83,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+83,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +83,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+83,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +83,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +84,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+84,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+84,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +84,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +84,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,100.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+85,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +85,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+85,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+85,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +85,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +85,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +86,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+86,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+86,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +86,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +86,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+86,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +87,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+87,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+87,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +87,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +87,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+88,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +88,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+88,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +88,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+88,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +88,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +89,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+89,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+89,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +89,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +89,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+90,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +90,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+90,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +90,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+90,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +90,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +91,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+91,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+91,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +91,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +91,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+92,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +92,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+92,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+92,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +92,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +92,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +93,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+93,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+93,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +93,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +93,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+93,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +94,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+94,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+94,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +94,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,76.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +94,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,101.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+95,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +95,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+95,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +95,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+95,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +95,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +96,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+96,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,11.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,21.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,26.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+96,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +96,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,75.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +96,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+97,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +97,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+97,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +97,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+97,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +97,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +98,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+98,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,13.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+98,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +98,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +98,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+99,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +99,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+99,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+99,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +99,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +99,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +100,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+100,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+100,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,36.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +100,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +100,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+100,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +101,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+101,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+101,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +101,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +101,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+102,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +102,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+102,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,15.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +102,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+102,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +102,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +103,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+103,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+103,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +103,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +103,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+104,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +104,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+104,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+104,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,37.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +104,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +104,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +105,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+105,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+105,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +105,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +105,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+105,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +106,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,14.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+106,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+106,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +106,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +106,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+107,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +107,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+107,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +107,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+107,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +107,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +108,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+108,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+108,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +108,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +108,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+109,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +109,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+109,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +109,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,31.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+109,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +109,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,102.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +110,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+110,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+110,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,38.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +110,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +110,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+111,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +111,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+111,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+111,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +111,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +111,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +112,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+112,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+112,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +112,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +112,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+112,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,103.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +113,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+113,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+113,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +113,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,73.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +113,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+114,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +114,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+114,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +114,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+114,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +114,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +115,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+115,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+115,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +115,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +115,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+116,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +116,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+116,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +116,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,40.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+116,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +116,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +117,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+117,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,30.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+117,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +117,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +117,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,106.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+118,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +118,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+118,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+118,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +118,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +118,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,105.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +119,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+119,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,30.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+119,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +119,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +119,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+119,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,107.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +120,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+120,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+120,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,39.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +120,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +120,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+121,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +121,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,11.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+121,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +121,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+121,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +121,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +122,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+122,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+122,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +122,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +122,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+123,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +123,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+123,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,30.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.08,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+123,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +123,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,87.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +123,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,105.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +124,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+124,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+124,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +124,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,84.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +124,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+124,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,105.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +125,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+125,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,28.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.44,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+125,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +125,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,88.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +125,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,105.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+126,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +126,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+126,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,27.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +126,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+126,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +126,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,105.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +127,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+127,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,29.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.28,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,35.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+127,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.88,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +127,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.16,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.84,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +127,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,104.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,576,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,6.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,0,8.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,3072,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,10.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+128,4096,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,4608,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,1,9.6,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,7168,256,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,2,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +128,576,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,7.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,10.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,13.52,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,12.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,512,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,16.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,576,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,12.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,20.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,23.92,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,24.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 
+128,7168,1536,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,30.32,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,512,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,14.96,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,25.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.76,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,29.36,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,2048,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,36.4,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,512,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,16.56,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +128,1536,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,28.8,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,32.48,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,33.2,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,2304,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,40.72,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,512,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,3,41.12,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 
+128,1536,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,4,74.0,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,3072,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,83.68,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4096,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,85.04,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,4608,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,5,86.24,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +128,7168,7168,64,0,torch.float16,torch.float16,0.0,1.0,hsaco,6,106.64,Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 diff --git a/aiter/configs/asm_tune/untuned_awqgemm_NN.csv b/aiter/configs/asm_tune/untuned_awqgemm_NN.csv new file mode 100644 index 0000000000000000000000000000000000000000..7560bdc78e17b6e43b69a31f8df79e4d01085891 --- /dev/null +++ b/aiter/configs/asm_tune/untuned_awqgemm_NN.csv @@ -0,0 +1,4609 @@ +M,N,K,G,bias,dtype,outdtype,scaleAB,awqgemm +1,576,256,64,0,f16,f16,0,1 +1,1536,256,64,0,f16,f16,0,1 +1,3072,256,64,0,f16,f16,0,1 +1,4096,256,64,0,f16,f16,0,1 +1,4608,256,64,0,f16,f16,0,1 +1,7168,256,64,0,f16,f16,0,1 +1,576,512,64,0,f16,f16,0,1 +1,1536,512,64,0,f16,f16,0,1 +1,3072,512,64,0,f16,f16,0,1 +1,4096,512,64,0,f16,f16,0,1 +1,4608,512,64,0,f16,f16,0,1 +1,7168,512,64,0,f16,f16,0,1 +1,576,1536,64,0,f16,f16,0,1 +1,1536,1536,64,0,f16,f16,0,1 +1,3072,1536,64,0,f16,f16,0,1 +1,4096,1536,64,0,f16,f16,0,1 +1,4608,1536,64,0,f16,f16,0,1 +1,7168,1536,64,0,f16,f16,0,1 +1,512,2048,64,0,f16,f16,0,1 +1,1536,2048,64,0,f16,f16,0,1 +1,3072,2048,64,0,f16,f16,0,1 +1,4096,2048,64,0,f16,f16,0,1 +1,4608,2048,64,0,f16,f16,0,1 +1,7168,2048,64,0,f16,f16,0,1 +1,512,2304,64,0,f16,f16,0,1 +1,1536,2304,64,0,f16,f16,0,1 +1,3072,2304,64,0,f16,f16,0,1 +1,4096,2304,64,0,f16,f16,0,1 +1,4608,2304,64,0,f16,f16,0,1 +1,7168,2304,64,0,f16,f16,0,1 +1,512,7168,64,0,f16,f16,0,1 
+1,1536,7168,64,0,f16,f16,0,1 +1,3072,7168,64,0,f16,f16,0,1 +1,4096,7168,64,0,f16,f16,0,1 +1,4608,7168,64,0,f16,f16,0,1 +1,7168,7168,64,0,f16,f16,0,1 +2,576,256,64,0,f16,f16,0,1 +2,1536,256,64,0,f16,f16,0,1 +2,3072,256,64,0,f16,f16,0,1 +2,4096,256,64,0,f16,f16,0,1 +2,4608,256,64,0,f16,f16,0,1 +2,7168,256,64,0,f16,f16,0,1 +2,576,512,64,0,f16,f16,0,1 +2,1536,512,64,0,f16,f16,0,1 +2,3072,512,64,0,f16,f16,0,1 +2,4096,512,64,0,f16,f16,0,1 +2,4608,512,64,0,f16,f16,0,1 +2,7168,512,64,0,f16,f16,0,1 +2,576,1536,64,0,f16,f16,0,1 +2,1536,1536,64,0,f16,f16,0,1 +2,3072,1536,64,0,f16,f16,0,1 +2,4096,1536,64,0,f16,f16,0,1 +2,4608,1536,64,0,f16,f16,0,1 +2,7168,1536,64,0,f16,f16,0,1 +2,512,2048,64,0,f16,f16,0,1 +2,1536,2048,64,0,f16,f16,0,1 +2,3072,2048,64,0,f16,f16,0,1 +2,4096,2048,64,0,f16,f16,0,1 +2,4608,2048,64,0,f16,f16,0,1 +2,7168,2048,64,0,f16,f16,0,1 +2,512,2304,64,0,f16,f16,0,1 +2,1536,2304,64,0,f16,f16,0,1 +2,3072,2304,64,0,f16,f16,0,1 +2,4096,2304,64,0,f16,f16,0,1 +2,4608,2304,64,0,f16,f16,0,1 +2,7168,2304,64,0,f16,f16,0,1 +2,512,7168,64,0,f16,f16,0,1 +2,1536,7168,64,0,f16,f16,0,1 +2,3072,7168,64,0,f16,f16,0,1 +2,4096,7168,64,0,f16,f16,0,1 +2,4608,7168,64,0,f16,f16,0,1 +2,7168,7168,64,0,f16,f16,0,1 +3,576,256,64,0,f16,f16,0,1 +3,1536,256,64,0,f16,f16,0,1 +3,3072,256,64,0,f16,f16,0,1 +3,4096,256,64,0,f16,f16,0,1 +3,4608,256,64,0,f16,f16,0,1 +3,7168,256,64,0,f16,f16,0,1 +3,576,512,64,0,f16,f16,0,1 +3,1536,512,64,0,f16,f16,0,1 +3,3072,512,64,0,f16,f16,0,1 +3,4096,512,64,0,f16,f16,0,1 +3,4608,512,64,0,f16,f16,0,1 +3,7168,512,64,0,f16,f16,0,1 +3,576,1536,64,0,f16,f16,0,1 +3,1536,1536,64,0,f16,f16,0,1 +3,3072,1536,64,0,f16,f16,0,1 +3,4096,1536,64,0,f16,f16,0,1 +3,4608,1536,64,0,f16,f16,0,1 +3,7168,1536,64,0,f16,f16,0,1 +3,512,2048,64,0,f16,f16,0,1 +3,1536,2048,64,0,f16,f16,0,1 +3,3072,2048,64,0,f16,f16,0,1 +3,4096,2048,64,0,f16,f16,0,1 +3,4608,2048,64,0,f16,f16,0,1 +3,7168,2048,64,0,f16,f16,0,1 +3,512,2304,64,0,f16,f16,0,1 +3,1536,2304,64,0,f16,f16,0,1 
+3,3072,2304,64,0,f16,f16,0,1 +3,4096,2304,64,0,f16,f16,0,1 +3,4608,2304,64,0,f16,f16,0,1 +3,7168,2304,64,0,f16,f16,0,1 +3,512,7168,64,0,f16,f16,0,1 +3,1536,7168,64,0,f16,f16,0,1 +3,3072,7168,64,0,f16,f16,0,1 +3,4096,7168,64,0,f16,f16,0,1 +3,4608,7168,64,0,f16,f16,0,1 +3,7168,7168,64,0,f16,f16,0,1 +4,576,256,64,0,f16,f16,0,1 +4,1536,256,64,0,f16,f16,0,1 +4,3072,256,64,0,f16,f16,0,1 +4,4096,256,64,0,f16,f16,0,1 +4,4608,256,64,0,f16,f16,0,1 +4,7168,256,64,0,f16,f16,0,1 +4,576,512,64,0,f16,f16,0,1 +4,1536,512,64,0,f16,f16,0,1 +4,3072,512,64,0,f16,f16,0,1 +4,4096,512,64,0,f16,f16,0,1 +4,4608,512,64,0,f16,f16,0,1 +4,7168,512,64,0,f16,f16,0,1 +4,576,1536,64,0,f16,f16,0,1 +4,1536,1536,64,0,f16,f16,0,1 +4,3072,1536,64,0,f16,f16,0,1 +4,4096,1536,64,0,f16,f16,0,1 +4,4608,1536,64,0,f16,f16,0,1 +4,7168,1536,64,0,f16,f16,0,1 +4,512,2048,64,0,f16,f16,0,1 +4,1536,2048,64,0,f16,f16,0,1 +4,3072,2048,64,0,f16,f16,0,1 +4,4096,2048,64,0,f16,f16,0,1 +4,4608,2048,64,0,f16,f16,0,1 +4,7168,2048,64,0,f16,f16,0,1 +4,512,2304,64,0,f16,f16,0,1 +4,1536,2304,64,0,f16,f16,0,1 +4,3072,2304,64,0,f16,f16,0,1 +4,4096,2304,64,0,f16,f16,0,1 +4,4608,2304,64,0,f16,f16,0,1 +4,7168,2304,64,0,f16,f16,0,1 +4,512,7168,64,0,f16,f16,0,1 +4,1536,7168,64,0,f16,f16,0,1 +4,3072,7168,64,0,f16,f16,0,1 +4,4096,7168,64,0,f16,f16,0,1 +4,4608,7168,64,0,f16,f16,0,1 +4,7168,7168,64,0,f16,f16,0,1 +5,576,256,64,0,f16,f16,0,1 +5,1536,256,64,0,f16,f16,0,1 +5,3072,256,64,0,f16,f16,0,1 +5,4096,256,64,0,f16,f16,0,1 +5,4608,256,64,0,f16,f16,0,1 +5,7168,256,64,0,f16,f16,0,1 +5,576,512,64,0,f16,f16,0,1 +5,1536,512,64,0,f16,f16,0,1 +5,3072,512,64,0,f16,f16,0,1 +5,4096,512,64,0,f16,f16,0,1 +5,4608,512,64,0,f16,f16,0,1 +5,7168,512,64,0,f16,f16,0,1 +5,576,1536,64,0,f16,f16,0,1 +5,1536,1536,64,0,f16,f16,0,1 +5,3072,1536,64,0,f16,f16,0,1 +5,4096,1536,64,0,f16,f16,0,1 +5,4608,1536,64,0,f16,f16,0,1 +5,7168,1536,64,0,f16,f16,0,1 +5,512,2048,64,0,f16,f16,0,1 +5,1536,2048,64,0,f16,f16,0,1 +5,3072,2048,64,0,f16,f16,0,1 
+5,4096,2048,64,0,f16,f16,0,1 +5,4608,2048,64,0,f16,f16,0,1 +5,7168,2048,64,0,f16,f16,0,1 +5,512,2304,64,0,f16,f16,0,1 +5,1536,2304,64,0,f16,f16,0,1 +5,3072,2304,64,0,f16,f16,0,1 +5,4096,2304,64,0,f16,f16,0,1 +5,4608,2304,64,0,f16,f16,0,1 +5,7168,2304,64,0,f16,f16,0,1 +5,512,7168,64,0,f16,f16,0,1 +5,1536,7168,64,0,f16,f16,0,1 +5,3072,7168,64,0,f16,f16,0,1 +5,4096,7168,64,0,f16,f16,0,1 +5,4608,7168,64,0,f16,f16,0,1 +5,7168,7168,64,0,f16,f16,0,1 +6,576,256,64,0,f16,f16,0,1 +6,1536,256,64,0,f16,f16,0,1 +6,3072,256,64,0,f16,f16,0,1 +6,4096,256,64,0,f16,f16,0,1 +6,4608,256,64,0,f16,f16,0,1 +6,7168,256,64,0,f16,f16,0,1 +6,576,512,64,0,f16,f16,0,1 +6,1536,512,64,0,f16,f16,0,1 +6,3072,512,64,0,f16,f16,0,1 +6,4096,512,64,0,f16,f16,0,1 +6,4608,512,64,0,f16,f16,0,1 +6,7168,512,64,0,f16,f16,0,1 +6,576,1536,64,0,f16,f16,0,1 +6,1536,1536,64,0,f16,f16,0,1 +6,3072,1536,64,0,f16,f16,0,1 +6,4096,1536,64,0,f16,f16,0,1 +6,4608,1536,64,0,f16,f16,0,1 +6,7168,1536,64,0,f16,f16,0,1 +6,512,2048,64,0,f16,f16,0,1 +6,1536,2048,64,0,f16,f16,0,1 +6,3072,2048,64,0,f16,f16,0,1 +6,4096,2048,64,0,f16,f16,0,1 +6,4608,2048,64,0,f16,f16,0,1 +6,7168,2048,64,0,f16,f16,0,1 +6,512,2304,64,0,f16,f16,0,1 +6,1536,2304,64,0,f16,f16,0,1 +6,3072,2304,64,0,f16,f16,0,1 +6,4096,2304,64,0,f16,f16,0,1 +6,4608,2304,64,0,f16,f16,0,1 +6,7168,2304,64,0,f16,f16,0,1 +6,512,7168,64,0,f16,f16,0,1 +6,1536,7168,64,0,f16,f16,0,1 +6,3072,7168,64,0,f16,f16,0,1 +6,4096,7168,64,0,f16,f16,0,1 +6,4608,7168,64,0,f16,f16,0,1 +6,7168,7168,64,0,f16,f16,0,1 +7,576,256,64,0,f16,f16,0,1 +7,1536,256,64,0,f16,f16,0,1 +7,3072,256,64,0,f16,f16,0,1 +7,4096,256,64,0,f16,f16,0,1 +7,4608,256,64,0,f16,f16,0,1 +7,7168,256,64,0,f16,f16,0,1 +7,576,512,64,0,f16,f16,0,1 +7,1536,512,64,0,f16,f16,0,1 +7,3072,512,64,0,f16,f16,0,1 +7,4096,512,64,0,f16,f16,0,1 +7,4608,512,64,0,f16,f16,0,1 +7,7168,512,64,0,f16,f16,0,1 +7,576,1536,64,0,f16,f16,0,1 +7,1536,1536,64,0,f16,f16,0,1 +7,3072,1536,64,0,f16,f16,0,1 +7,4096,1536,64,0,f16,f16,0,1 
+7,4608,1536,64,0,f16,f16,0,1 +7,7168,1536,64,0,f16,f16,0,1 +7,512,2048,64,0,f16,f16,0,1 +7,1536,2048,64,0,f16,f16,0,1 +7,3072,2048,64,0,f16,f16,0,1 +7,4096,2048,64,0,f16,f16,0,1 +7,4608,2048,64,0,f16,f16,0,1 +7,7168,2048,64,0,f16,f16,0,1 +7,512,2304,64,0,f16,f16,0,1 +7,1536,2304,64,0,f16,f16,0,1 +7,3072,2304,64,0,f16,f16,0,1 +7,4096,2304,64,0,f16,f16,0,1 +7,4608,2304,64,0,f16,f16,0,1 +7,7168,2304,64,0,f16,f16,0,1 +7,512,7168,64,0,f16,f16,0,1 +7,1536,7168,64,0,f16,f16,0,1 +7,3072,7168,64,0,f16,f16,0,1 +7,4096,7168,64,0,f16,f16,0,1 +7,4608,7168,64,0,f16,f16,0,1 +7,7168,7168,64,0,f16,f16,0,1 +8,576,256,64,0,f16,f16,0,1 +8,1536,256,64,0,f16,f16,0,1 +8,3072,256,64,0,f16,f16,0,1 +8,4096,256,64,0,f16,f16,0,1 +8,4608,256,64,0,f16,f16,0,1 +8,7168,256,64,0,f16,f16,0,1 +8,576,512,64,0,f16,f16,0,1 +8,1536,512,64,0,f16,f16,0,1 +8,3072,512,64,0,f16,f16,0,1 +8,4096,512,64,0,f16,f16,0,1 +8,4608,512,64,0,f16,f16,0,1 +8,7168,512,64,0,f16,f16,0,1 +8,576,1536,64,0,f16,f16,0,1 +8,1536,1536,64,0,f16,f16,0,1 +8,3072,1536,64,0,f16,f16,0,1 +8,4096,1536,64,0,f16,f16,0,1 +8,4608,1536,64,0,f16,f16,0,1 +8,7168,1536,64,0,f16,f16,0,1 +8,512,2048,64,0,f16,f16,0,1 +8,1536,2048,64,0,f16,f16,0,1 +8,3072,2048,64,0,f16,f16,0,1 +8,4096,2048,64,0,f16,f16,0,1 +8,4608,2048,64,0,f16,f16,0,1 +8,7168,2048,64,0,f16,f16,0,1 +8,512,2304,64,0,f16,f16,0,1 +8,1536,2304,64,0,f16,f16,0,1 +8,3072,2304,64,0,f16,f16,0,1 +8,4096,2304,64,0,f16,f16,0,1 +8,4608,2304,64,0,f16,f16,0,1 +8,7168,2304,64,0,f16,f16,0,1 +8,512,7168,64,0,f16,f16,0,1 +8,1536,7168,64,0,f16,f16,0,1 +8,3072,7168,64,0,f16,f16,0,1 +8,4096,7168,64,0,f16,f16,0,1 +8,4608,7168,64,0,f16,f16,0,1 +8,7168,7168,64,0,f16,f16,0,1 +9,576,256,64,0,f16,f16,0,1 +9,1536,256,64,0,f16,f16,0,1 +9,3072,256,64,0,f16,f16,0,1 +9,4096,256,64,0,f16,f16,0,1 +9,4608,256,64,0,f16,f16,0,1 +9,7168,256,64,0,f16,f16,0,1 +9,576,512,64,0,f16,f16,0,1 +9,1536,512,64,0,f16,f16,0,1 +9,3072,512,64,0,f16,f16,0,1 +9,4096,512,64,0,f16,f16,0,1 +9,4608,512,64,0,f16,f16,0,1 
+9,7168,512,64,0,f16,f16,0,1 +9,576,1536,64,0,f16,f16,0,1 +9,1536,1536,64,0,f16,f16,0,1 +9,3072,1536,64,0,f16,f16,0,1 +9,4096,1536,64,0,f16,f16,0,1 +9,4608,1536,64,0,f16,f16,0,1 +9,7168,1536,64,0,f16,f16,0,1 +9,512,2048,64,0,f16,f16,0,1 +9,1536,2048,64,0,f16,f16,0,1 +9,3072,2048,64,0,f16,f16,0,1 +9,4096,2048,64,0,f16,f16,0,1 +9,4608,2048,64,0,f16,f16,0,1 +9,7168,2048,64,0,f16,f16,0,1 +9,512,2304,64,0,f16,f16,0,1 +9,1536,2304,64,0,f16,f16,0,1 +9,3072,2304,64,0,f16,f16,0,1 +9,4096,2304,64,0,f16,f16,0,1 +9,4608,2304,64,0,f16,f16,0,1 +9,7168,2304,64,0,f16,f16,0,1 +9,512,7168,64,0,f16,f16,0,1 +9,1536,7168,64,0,f16,f16,0,1 +9,3072,7168,64,0,f16,f16,0,1 +9,4096,7168,64,0,f16,f16,0,1 +9,4608,7168,64,0,f16,f16,0,1 +9,7168,7168,64,0,f16,f16,0,1 +10,576,256,64,0,f16,f16,0,1 +10,1536,256,64,0,f16,f16,0,1 +10,3072,256,64,0,f16,f16,0,1 +10,4096,256,64,0,f16,f16,0,1 +10,4608,256,64,0,f16,f16,0,1 +10,7168,256,64,0,f16,f16,0,1 +10,576,512,64,0,f16,f16,0,1 +10,1536,512,64,0,f16,f16,0,1 +10,3072,512,64,0,f16,f16,0,1 +10,4096,512,64,0,f16,f16,0,1 +10,4608,512,64,0,f16,f16,0,1 +10,7168,512,64,0,f16,f16,0,1 +10,576,1536,64,0,f16,f16,0,1 +10,1536,1536,64,0,f16,f16,0,1 +10,3072,1536,64,0,f16,f16,0,1 +10,4096,1536,64,0,f16,f16,0,1 +10,4608,1536,64,0,f16,f16,0,1 +10,7168,1536,64,0,f16,f16,0,1 +10,512,2048,64,0,f16,f16,0,1 +10,1536,2048,64,0,f16,f16,0,1 +10,3072,2048,64,0,f16,f16,0,1 +10,4096,2048,64,0,f16,f16,0,1 +10,4608,2048,64,0,f16,f16,0,1 +10,7168,2048,64,0,f16,f16,0,1 +10,512,2304,64,0,f16,f16,0,1 +10,1536,2304,64,0,f16,f16,0,1 +10,3072,2304,64,0,f16,f16,0,1 +10,4096,2304,64,0,f16,f16,0,1 +10,4608,2304,64,0,f16,f16,0,1 +10,7168,2304,64,0,f16,f16,0,1 +10,512,7168,64,0,f16,f16,0,1 +10,1536,7168,64,0,f16,f16,0,1 +10,3072,7168,64,0,f16,f16,0,1 +10,4096,7168,64,0,f16,f16,0,1 +10,4608,7168,64,0,f16,f16,0,1 +10,7168,7168,64,0,f16,f16,0,1 +11,576,256,64,0,f16,f16,0,1 +11,1536,256,64,0,f16,f16,0,1 +11,3072,256,64,0,f16,f16,0,1 +11,4096,256,64,0,f16,f16,0,1 +11,4608,256,64,0,f16,f16,0,1 
+11,7168,256,64,0,f16,f16,0,1 +11,576,512,64,0,f16,f16,0,1 +11,1536,512,64,0,f16,f16,0,1 +11,3072,512,64,0,f16,f16,0,1 +11,4096,512,64,0,f16,f16,0,1 +11,4608,512,64,0,f16,f16,0,1 +11,7168,512,64,0,f16,f16,0,1 +11,576,1536,64,0,f16,f16,0,1 +11,1536,1536,64,0,f16,f16,0,1 +11,3072,1536,64,0,f16,f16,0,1 +11,4096,1536,64,0,f16,f16,0,1 +11,4608,1536,64,0,f16,f16,0,1 +11,7168,1536,64,0,f16,f16,0,1 +11,512,2048,64,0,f16,f16,0,1 +11,1536,2048,64,0,f16,f16,0,1 +11,3072,2048,64,0,f16,f16,0,1 +11,4096,2048,64,0,f16,f16,0,1 +11,4608,2048,64,0,f16,f16,0,1 +11,7168,2048,64,0,f16,f16,0,1 +11,512,2304,64,0,f16,f16,0,1 +11,1536,2304,64,0,f16,f16,0,1 +11,3072,2304,64,0,f16,f16,0,1 +11,4096,2304,64,0,f16,f16,0,1 +11,4608,2304,64,0,f16,f16,0,1 +11,7168,2304,64,0,f16,f16,0,1 +11,512,7168,64,0,f16,f16,0,1 +11,1536,7168,64,0,f16,f16,0,1 +11,3072,7168,64,0,f16,f16,0,1 +11,4096,7168,64,0,f16,f16,0,1 +11,4608,7168,64,0,f16,f16,0,1 +11,7168,7168,64,0,f16,f16,0,1 +12,576,256,64,0,f16,f16,0,1 +12,1536,256,64,0,f16,f16,0,1 +12,3072,256,64,0,f16,f16,0,1 +12,4096,256,64,0,f16,f16,0,1 +12,4608,256,64,0,f16,f16,0,1 +12,7168,256,64,0,f16,f16,0,1 +12,576,512,64,0,f16,f16,0,1 +12,1536,512,64,0,f16,f16,0,1 +12,3072,512,64,0,f16,f16,0,1 +12,4096,512,64,0,f16,f16,0,1 +12,4608,512,64,0,f16,f16,0,1 +12,7168,512,64,0,f16,f16,0,1 +12,576,1536,64,0,f16,f16,0,1 +12,1536,1536,64,0,f16,f16,0,1 +12,3072,1536,64,0,f16,f16,0,1 +12,4096,1536,64,0,f16,f16,0,1 +12,4608,1536,64,0,f16,f16,0,1 +12,7168,1536,64,0,f16,f16,0,1 +12,512,2048,64,0,f16,f16,0,1 +12,1536,2048,64,0,f16,f16,0,1 +12,3072,2048,64,0,f16,f16,0,1 +12,4096,2048,64,0,f16,f16,0,1 +12,4608,2048,64,0,f16,f16,0,1 +12,7168,2048,64,0,f16,f16,0,1 +12,512,2304,64,0,f16,f16,0,1 +12,1536,2304,64,0,f16,f16,0,1 +12,3072,2304,64,0,f16,f16,0,1 +12,4096,2304,64,0,f16,f16,0,1 +12,4608,2304,64,0,f16,f16,0,1 +12,7168,2304,64,0,f16,f16,0,1 +12,512,7168,64,0,f16,f16,0,1 +12,1536,7168,64,0,f16,f16,0,1 +12,3072,7168,64,0,f16,f16,0,1 +12,4096,7168,64,0,f16,f16,0,1 
+12,4608,7168,64,0,f16,f16,0,1 +12,7168,7168,64,0,f16,f16,0,1 +13,576,256,64,0,f16,f16,0,1 +13,1536,256,64,0,f16,f16,0,1 +13,3072,256,64,0,f16,f16,0,1 +13,4096,256,64,0,f16,f16,0,1 +13,4608,256,64,0,f16,f16,0,1 +13,7168,256,64,0,f16,f16,0,1 +13,576,512,64,0,f16,f16,0,1 +13,1536,512,64,0,f16,f16,0,1 +13,3072,512,64,0,f16,f16,0,1 +13,4096,512,64,0,f16,f16,0,1 +13,4608,512,64,0,f16,f16,0,1 +13,7168,512,64,0,f16,f16,0,1 +13,576,1536,64,0,f16,f16,0,1 +13,1536,1536,64,0,f16,f16,0,1 +13,3072,1536,64,0,f16,f16,0,1 +13,4096,1536,64,0,f16,f16,0,1 +13,4608,1536,64,0,f16,f16,0,1 +13,7168,1536,64,0,f16,f16,0,1 +13,512,2048,64,0,f16,f16,0,1 +13,1536,2048,64,0,f16,f16,0,1 +13,3072,2048,64,0,f16,f16,0,1 +13,4096,2048,64,0,f16,f16,0,1 +13,4608,2048,64,0,f16,f16,0,1 +13,7168,2048,64,0,f16,f16,0,1 +13,512,2304,64,0,f16,f16,0,1 +13,1536,2304,64,0,f16,f16,0,1 +13,3072,2304,64,0,f16,f16,0,1 +13,4096,2304,64,0,f16,f16,0,1 +13,4608,2304,64,0,f16,f16,0,1 +13,7168,2304,64,0,f16,f16,0,1 +13,512,7168,64,0,f16,f16,0,1 +13,1536,7168,64,0,f16,f16,0,1 +13,3072,7168,64,0,f16,f16,0,1 +13,4096,7168,64,0,f16,f16,0,1 +13,4608,7168,64,0,f16,f16,0,1 +13,7168,7168,64,0,f16,f16,0,1 +14,576,256,64,0,f16,f16,0,1 +14,1536,256,64,0,f16,f16,0,1 +14,3072,256,64,0,f16,f16,0,1 +14,4096,256,64,0,f16,f16,0,1 +14,4608,256,64,0,f16,f16,0,1 +14,7168,256,64,0,f16,f16,0,1 +14,576,512,64,0,f16,f16,0,1 +14,1536,512,64,0,f16,f16,0,1 +14,3072,512,64,0,f16,f16,0,1 +14,4096,512,64,0,f16,f16,0,1 +14,4608,512,64,0,f16,f16,0,1 +14,7168,512,64,0,f16,f16,0,1 +14,576,1536,64,0,f16,f16,0,1 +14,1536,1536,64,0,f16,f16,0,1 +14,3072,1536,64,0,f16,f16,0,1 +14,4096,1536,64,0,f16,f16,0,1 +14,4608,1536,64,0,f16,f16,0,1 +14,7168,1536,64,0,f16,f16,0,1 +14,512,2048,64,0,f16,f16,0,1 +14,1536,2048,64,0,f16,f16,0,1 +14,3072,2048,64,0,f16,f16,0,1 +14,4096,2048,64,0,f16,f16,0,1 +14,4608,2048,64,0,f16,f16,0,1 +14,7168,2048,64,0,f16,f16,0,1 +14,512,2304,64,0,f16,f16,0,1 +14,1536,2304,64,0,f16,f16,0,1 +14,3072,2304,64,0,f16,f16,0,1 
+14,4096,2304,64,0,f16,f16,0,1 +14,4608,2304,64,0,f16,f16,0,1 +14,7168,2304,64,0,f16,f16,0,1 +14,512,7168,64,0,f16,f16,0,1 +14,1536,7168,64,0,f16,f16,0,1 +14,3072,7168,64,0,f16,f16,0,1 +14,4096,7168,64,0,f16,f16,0,1 +14,4608,7168,64,0,f16,f16,0,1 +14,7168,7168,64,0,f16,f16,0,1 +15,576,256,64,0,f16,f16,0,1 +15,1536,256,64,0,f16,f16,0,1 +15,3072,256,64,0,f16,f16,0,1 +15,4096,256,64,0,f16,f16,0,1 +15,4608,256,64,0,f16,f16,0,1 +15,7168,256,64,0,f16,f16,0,1 +15,576,512,64,0,f16,f16,0,1 +15,1536,512,64,0,f16,f16,0,1 +15,3072,512,64,0,f16,f16,0,1 +15,4096,512,64,0,f16,f16,0,1 +15,4608,512,64,0,f16,f16,0,1 +15,7168,512,64,0,f16,f16,0,1 +15,576,1536,64,0,f16,f16,0,1 +15,1536,1536,64,0,f16,f16,0,1 +15,3072,1536,64,0,f16,f16,0,1 +15,4096,1536,64,0,f16,f16,0,1 +15,4608,1536,64,0,f16,f16,0,1 +15,7168,1536,64,0,f16,f16,0,1 +15,512,2048,64,0,f16,f16,0,1 +15,1536,2048,64,0,f16,f16,0,1 +15,3072,2048,64,0,f16,f16,0,1 +15,4096,2048,64,0,f16,f16,0,1 +15,4608,2048,64,0,f16,f16,0,1 +15,7168,2048,64,0,f16,f16,0,1 +15,512,2304,64,0,f16,f16,0,1 +15,1536,2304,64,0,f16,f16,0,1 +15,3072,2304,64,0,f16,f16,0,1 +15,4096,2304,64,0,f16,f16,0,1 +15,4608,2304,64,0,f16,f16,0,1 +15,7168,2304,64,0,f16,f16,0,1 +15,512,7168,64,0,f16,f16,0,1 +15,1536,7168,64,0,f16,f16,0,1 +15,3072,7168,64,0,f16,f16,0,1 +15,4096,7168,64,0,f16,f16,0,1 +15,4608,7168,64,0,f16,f16,0,1 +15,7168,7168,64,0,f16,f16,0,1 +16,576,256,64,0,f16,f16,0,1 +16,1536,256,64,0,f16,f16,0,1 +16,3072,256,64,0,f16,f16,0,1 +16,4096,256,64,0,f16,f16,0,1 +16,4608,256,64,0,f16,f16,0,1 +16,7168,256,64,0,f16,f16,0,1 +16,576,512,64,0,f16,f16,0,1 +16,1536,512,64,0,f16,f16,0,1 +16,3072,512,64,0,f16,f16,0,1 +16,4096,512,64,0,f16,f16,0,1 +16,4608,512,64,0,f16,f16,0,1 +16,7168,512,64,0,f16,f16,0,1 +16,576,1536,64,0,f16,f16,0,1 +16,1536,1536,64,0,f16,f16,0,1 +16,3072,1536,64,0,f16,f16,0,1 +16,4096,1536,64,0,f16,f16,0,1 +16,4608,1536,64,0,f16,f16,0,1 +16,7168,1536,64,0,f16,f16,0,1 +16,512,2048,64,0,f16,f16,0,1 +16,1536,2048,64,0,f16,f16,0,1 
+16,3072,2048,64,0,f16,f16,0,1 +16,4096,2048,64,0,f16,f16,0,1 +16,4608,2048,64,0,f16,f16,0,1 +16,7168,2048,64,0,f16,f16,0,1 +16,512,2304,64,0,f16,f16,0,1 +16,1536,2304,64,0,f16,f16,0,1 +16,3072,2304,64,0,f16,f16,0,1 +16,4096,2304,64,0,f16,f16,0,1 +16,4608,2304,64,0,f16,f16,0,1 +16,7168,2304,64,0,f16,f16,0,1 +16,512,7168,64,0,f16,f16,0,1 +16,1536,7168,64,0,f16,f16,0,1 +16,3072,7168,64,0,f16,f16,0,1 +16,4096,7168,64,0,f16,f16,0,1 +16,4608,7168,64,0,f16,f16,0,1 +16,7168,7168,64,0,f16,f16,0,1 +17,576,256,64,0,f16,f16,0,1 +17,1536,256,64,0,f16,f16,0,1 +17,3072,256,64,0,f16,f16,0,1 +17,4096,256,64,0,f16,f16,0,1 +17,4608,256,64,0,f16,f16,0,1 +17,7168,256,64,0,f16,f16,0,1 +17,576,512,64,0,f16,f16,0,1 +17,1536,512,64,0,f16,f16,0,1 +17,3072,512,64,0,f16,f16,0,1 +17,4096,512,64,0,f16,f16,0,1 +17,4608,512,64,0,f16,f16,0,1 +17,7168,512,64,0,f16,f16,0,1 +17,576,1536,64,0,f16,f16,0,1 +17,1536,1536,64,0,f16,f16,0,1 +17,3072,1536,64,0,f16,f16,0,1 +17,4096,1536,64,0,f16,f16,0,1 +17,4608,1536,64,0,f16,f16,0,1 +17,7168,1536,64,0,f16,f16,0,1 +17,512,2048,64,0,f16,f16,0,1 +17,1536,2048,64,0,f16,f16,0,1 +17,3072,2048,64,0,f16,f16,0,1 +17,4096,2048,64,0,f16,f16,0,1 +17,4608,2048,64,0,f16,f16,0,1 +17,7168,2048,64,0,f16,f16,0,1 +17,512,2304,64,0,f16,f16,0,1 +17,1536,2304,64,0,f16,f16,0,1 +17,3072,2304,64,0,f16,f16,0,1 +17,4096,2304,64,0,f16,f16,0,1 +17,4608,2304,64,0,f16,f16,0,1 +17,7168,2304,64,0,f16,f16,0,1 +17,512,7168,64,0,f16,f16,0,1 +17,1536,7168,64,0,f16,f16,0,1 +17,3072,7168,64,0,f16,f16,0,1 +17,4096,7168,64,0,f16,f16,0,1 +17,4608,7168,64,0,f16,f16,0,1 +17,7168,7168,64,0,f16,f16,0,1 +18,576,256,64,0,f16,f16,0,1 +18,1536,256,64,0,f16,f16,0,1 +18,3072,256,64,0,f16,f16,0,1 +18,4096,256,64,0,f16,f16,0,1 +18,4608,256,64,0,f16,f16,0,1 +18,7168,256,64,0,f16,f16,0,1 +18,576,512,64,0,f16,f16,0,1 +18,1536,512,64,0,f16,f16,0,1 +18,3072,512,64,0,f16,f16,0,1 +18,4096,512,64,0,f16,f16,0,1 +18,4608,512,64,0,f16,f16,0,1 +18,7168,512,64,0,f16,f16,0,1 +18,576,1536,64,0,f16,f16,0,1 
+18,1536,1536,64,0,f16,f16,0,1 +18,3072,1536,64,0,f16,f16,0,1 +18,4096,1536,64,0,f16,f16,0,1 +18,4608,1536,64,0,f16,f16,0,1 +18,7168,1536,64,0,f16,f16,0,1 +18,512,2048,64,0,f16,f16,0,1 +18,1536,2048,64,0,f16,f16,0,1 +18,3072,2048,64,0,f16,f16,0,1 +18,4096,2048,64,0,f16,f16,0,1 +18,4608,2048,64,0,f16,f16,0,1 +18,7168,2048,64,0,f16,f16,0,1 +18,512,2304,64,0,f16,f16,0,1 +18,1536,2304,64,0,f16,f16,0,1 +18,3072,2304,64,0,f16,f16,0,1 +18,4096,2304,64,0,f16,f16,0,1 +18,4608,2304,64,0,f16,f16,0,1 +18,7168,2304,64,0,f16,f16,0,1 +18,512,7168,64,0,f16,f16,0,1 +18,1536,7168,64,0,f16,f16,0,1 +18,3072,7168,64,0,f16,f16,0,1 +18,4096,7168,64,0,f16,f16,0,1 +18,4608,7168,64,0,f16,f16,0,1 +18,7168,7168,64,0,f16,f16,0,1 +19,576,256,64,0,f16,f16,0,1 +19,1536,256,64,0,f16,f16,0,1 +19,3072,256,64,0,f16,f16,0,1 +19,4096,256,64,0,f16,f16,0,1 +19,4608,256,64,0,f16,f16,0,1 +19,7168,256,64,0,f16,f16,0,1 +19,576,512,64,0,f16,f16,0,1 +19,1536,512,64,0,f16,f16,0,1 +19,3072,512,64,0,f16,f16,0,1 +19,4096,512,64,0,f16,f16,0,1 +19,4608,512,64,0,f16,f16,0,1 +19,7168,512,64,0,f16,f16,0,1 +19,576,1536,64,0,f16,f16,0,1 +19,1536,1536,64,0,f16,f16,0,1 +19,3072,1536,64,0,f16,f16,0,1 +19,4096,1536,64,0,f16,f16,0,1 +19,4608,1536,64,0,f16,f16,0,1 +19,7168,1536,64,0,f16,f16,0,1 +19,512,2048,64,0,f16,f16,0,1 +19,1536,2048,64,0,f16,f16,0,1 +19,3072,2048,64,0,f16,f16,0,1 +19,4096,2048,64,0,f16,f16,0,1 +19,4608,2048,64,0,f16,f16,0,1 +19,7168,2048,64,0,f16,f16,0,1 +19,512,2304,64,0,f16,f16,0,1 +19,1536,2304,64,0,f16,f16,0,1 +19,3072,2304,64,0,f16,f16,0,1 +19,4096,2304,64,0,f16,f16,0,1 +19,4608,2304,64,0,f16,f16,0,1 +19,7168,2304,64,0,f16,f16,0,1 +19,512,7168,64,0,f16,f16,0,1 +19,1536,7168,64,0,f16,f16,0,1 +19,3072,7168,64,0,f16,f16,0,1 +19,4096,7168,64,0,f16,f16,0,1 +19,4608,7168,64,0,f16,f16,0,1 +19,7168,7168,64,0,f16,f16,0,1 +20,576,256,64,0,f16,f16,0,1 +20,1536,256,64,0,f16,f16,0,1 +20,3072,256,64,0,f16,f16,0,1 +20,4096,256,64,0,f16,f16,0,1 +20,4608,256,64,0,f16,f16,0,1 +20,7168,256,64,0,f16,f16,0,1 
+20,576,512,64,0,f16,f16,0,1 +20,1536,512,64,0,f16,f16,0,1 +20,3072,512,64,0,f16,f16,0,1 +20,4096,512,64,0,f16,f16,0,1 +20,4608,512,64,0,f16,f16,0,1 +20,7168,512,64,0,f16,f16,0,1 +20,576,1536,64,0,f16,f16,0,1 +20,1536,1536,64,0,f16,f16,0,1 +20,3072,1536,64,0,f16,f16,0,1 +20,4096,1536,64,0,f16,f16,0,1 +20,4608,1536,64,0,f16,f16,0,1 +20,7168,1536,64,0,f16,f16,0,1 +20,512,2048,64,0,f16,f16,0,1 +20,1536,2048,64,0,f16,f16,0,1 +20,3072,2048,64,0,f16,f16,0,1 +20,4096,2048,64,0,f16,f16,0,1 +20,4608,2048,64,0,f16,f16,0,1 +20,7168,2048,64,0,f16,f16,0,1 +20,512,2304,64,0,f16,f16,0,1 +20,1536,2304,64,0,f16,f16,0,1 +20,3072,2304,64,0,f16,f16,0,1 +20,4096,2304,64,0,f16,f16,0,1 +20,4608,2304,64,0,f16,f16,0,1 +20,7168,2304,64,0,f16,f16,0,1 +20,512,7168,64,0,f16,f16,0,1 +20,1536,7168,64,0,f16,f16,0,1 +20,3072,7168,64,0,f16,f16,0,1 +20,4096,7168,64,0,f16,f16,0,1 +20,4608,7168,64,0,f16,f16,0,1 +20,7168,7168,64,0,f16,f16,0,1 +21,576,256,64,0,f16,f16,0,1 +21,1536,256,64,0,f16,f16,0,1 +21,3072,256,64,0,f16,f16,0,1 +21,4096,256,64,0,f16,f16,0,1 +21,4608,256,64,0,f16,f16,0,1 +21,7168,256,64,0,f16,f16,0,1 +21,576,512,64,0,f16,f16,0,1 +21,1536,512,64,0,f16,f16,0,1 +21,3072,512,64,0,f16,f16,0,1 +21,4096,512,64,0,f16,f16,0,1 +21,4608,512,64,0,f16,f16,0,1 +21,7168,512,64,0,f16,f16,0,1 +21,576,1536,64,0,f16,f16,0,1 +21,1536,1536,64,0,f16,f16,0,1 +21,3072,1536,64,0,f16,f16,0,1 +21,4096,1536,64,0,f16,f16,0,1 +21,4608,1536,64,0,f16,f16,0,1 +21,7168,1536,64,0,f16,f16,0,1 +21,512,2048,64,0,f16,f16,0,1 +21,1536,2048,64,0,f16,f16,0,1 +21,3072,2048,64,0,f16,f16,0,1 +21,4096,2048,64,0,f16,f16,0,1 +21,4608,2048,64,0,f16,f16,0,1 +21,7168,2048,64,0,f16,f16,0,1 +21,512,2304,64,0,f16,f16,0,1 +21,1536,2304,64,0,f16,f16,0,1 +21,3072,2304,64,0,f16,f16,0,1 +21,4096,2304,64,0,f16,f16,0,1 +21,4608,2304,64,0,f16,f16,0,1 +21,7168,2304,64,0,f16,f16,0,1 +21,512,7168,64,0,f16,f16,0,1 +21,1536,7168,64,0,f16,f16,0,1 +21,3072,7168,64,0,f16,f16,0,1 +21,4096,7168,64,0,f16,f16,0,1 +21,4608,7168,64,0,f16,f16,0,1 
+21,7168,7168,64,0,f16,f16,0,1 +22,576,256,64,0,f16,f16,0,1 +22,1536,256,64,0,f16,f16,0,1 +22,3072,256,64,0,f16,f16,0,1 +22,4096,256,64,0,f16,f16,0,1 +22,4608,256,64,0,f16,f16,0,1 +22,7168,256,64,0,f16,f16,0,1 +22,576,512,64,0,f16,f16,0,1 +22,1536,512,64,0,f16,f16,0,1 +22,3072,512,64,0,f16,f16,0,1 +22,4096,512,64,0,f16,f16,0,1 +22,4608,512,64,0,f16,f16,0,1 +22,7168,512,64,0,f16,f16,0,1 +22,576,1536,64,0,f16,f16,0,1 +22,1536,1536,64,0,f16,f16,0,1 +22,3072,1536,64,0,f16,f16,0,1 +22,4096,1536,64,0,f16,f16,0,1 +22,4608,1536,64,0,f16,f16,0,1 +22,7168,1536,64,0,f16,f16,0,1 +22,512,2048,64,0,f16,f16,0,1 +22,1536,2048,64,0,f16,f16,0,1 +22,3072,2048,64,0,f16,f16,0,1 +22,4096,2048,64,0,f16,f16,0,1 +22,4608,2048,64,0,f16,f16,0,1 +22,7168,2048,64,0,f16,f16,0,1 +22,512,2304,64,0,f16,f16,0,1 +22,1536,2304,64,0,f16,f16,0,1 +22,3072,2304,64,0,f16,f16,0,1 +22,4096,2304,64,0,f16,f16,0,1 +22,4608,2304,64,0,f16,f16,0,1 +22,7168,2304,64,0,f16,f16,0,1 +22,512,7168,64,0,f16,f16,0,1 +22,1536,7168,64,0,f16,f16,0,1 +22,3072,7168,64,0,f16,f16,0,1 +22,4096,7168,64,0,f16,f16,0,1 +22,4608,7168,64,0,f16,f16,0,1 +22,7168,7168,64,0,f16,f16,0,1 +23,576,256,64,0,f16,f16,0,1 +23,1536,256,64,0,f16,f16,0,1 +23,3072,256,64,0,f16,f16,0,1 +23,4096,256,64,0,f16,f16,0,1 +23,4608,256,64,0,f16,f16,0,1 +23,7168,256,64,0,f16,f16,0,1 +23,576,512,64,0,f16,f16,0,1 +23,1536,512,64,0,f16,f16,0,1 +23,3072,512,64,0,f16,f16,0,1 +23,4096,512,64,0,f16,f16,0,1 +23,4608,512,64,0,f16,f16,0,1 +23,7168,512,64,0,f16,f16,0,1 +23,576,1536,64,0,f16,f16,0,1 +23,1536,1536,64,0,f16,f16,0,1 +23,3072,1536,64,0,f16,f16,0,1 +23,4096,1536,64,0,f16,f16,0,1 +23,4608,1536,64,0,f16,f16,0,1 +23,7168,1536,64,0,f16,f16,0,1 +23,512,2048,64,0,f16,f16,0,1 +23,1536,2048,64,0,f16,f16,0,1 +23,3072,2048,64,0,f16,f16,0,1 +23,4096,2048,64,0,f16,f16,0,1 +23,4608,2048,64,0,f16,f16,0,1 +23,7168,2048,64,0,f16,f16,0,1 +23,512,2304,64,0,f16,f16,0,1 +23,1536,2304,64,0,f16,f16,0,1 +23,3072,2304,64,0,f16,f16,0,1 +23,4096,2304,64,0,f16,f16,0,1 
+23,4608,2304,64,0,f16,f16,0,1 +23,7168,2304,64,0,f16,f16,0,1 +23,512,7168,64,0,f16,f16,0,1 +23,1536,7168,64,0,f16,f16,0,1 +23,3072,7168,64,0,f16,f16,0,1 +23,4096,7168,64,0,f16,f16,0,1 +23,4608,7168,64,0,f16,f16,0,1 +23,7168,7168,64,0,f16,f16,0,1 +24,576,256,64,0,f16,f16,0,1 +24,1536,256,64,0,f16,f16,0,1 +24,3072,256,64,0,f16,f16,0,1 +24,4096,256,64,0,f16,f16,0,1 +24,4608,256,64,0,f16,f16,0,1 +24,7168,256,64,0,f16,f16,0,1 +24,576,512,64,0,f16,f16,0,1 +24,1536,512,64,0,f16,f16,0,1 +24,3072,512,64,0,f16,f16,0,1 +24,4096,512,64,0,f16,f16,0,1 +24,4608,512,64,0,f16,f16,0,1 +24,7168,512,64,0,f16,f16,0,1 +24,576,1536,64,0,f16,f16,0,1 +24,1536,1536,64,0,f16,f16,0,1 +24,3072,1536,64,0,f16,f16,0,1 +24,4096,1536,64,0,f16,f16,0,1 +24,4608,1536,64,0,f16,f16,0,1 +24,7168,1536,64,0,f16,f16,0,1 +24,512,2048,64,0,f16,f16,0,1 +24,1536,2048,64,0,f16,f16,0,1 +24,3072,2048,64,0,f16,f16,0,1 +24,4096,2048,64,0,f16,f16,0,1 +24,4608,2048,64,0,f16,f16,0,1 +24,7168,2048,64,0,f16,f16,0,1 +24,512,2304,64,0,f16,f16,0,1 +24,1536,2304,64,0,f16,f16,0,1 +24,3072,2304,64,0,f16,f16,0,1 +24,4096,2304,64,0,f16,f16,0,1 +24,4608,2304,64,0,f16,f16,0,1 +24,7168,2304,64,0,f16,f16,0,1 +24,512,7168,64,0,f16,f16,0,1 +24,1536,7168,64,0,f16,f16,0,1 +24,3072,7168,64,0,f16,f16,0,1 +24,4096,7168,64,0,f16,f16,0,1 +24,4608,7168,64,0,f16,f16,0,1 +24,7168,7168,64,0,f16,f16,0,1 +25,576,256,64,0,f16,f16,0,1 +25,1536,256,64,0,f16,f16,0,1 +25,3072,256,64,0,f16,f16,0,1 +25,4096,256,64,0,f16,f16,0,1 +25,4608,256,64,0,f16,f16,0,1 +25,7168,256,64,0,f16,f16,0,1 +25,576,512,64,0,f16,f16,0,1 +25,1536,512,64,0,f16,f16,0,1 +25,3072,512,64,0,f16,f16,0,1 +25,4096,512,64,0,f16,f16,0,1 +25,4608,512,64,0,f16,f16,0,1 +25,7168,512,64,0,f16,f16,0,1 +25,576,1536,64,0,f16,f16,0,1 +25,1536,1536,64,0,f16,f16,0,1 +25,3072,1536,64,0,f16,f16,0,1 +25,4096,1536,64,0,f16,f16,0,1 +25,4608,1536,64,0,f16,f16,0,1 +25,7168,1536,64,0,f16,f16,0,1 +25,512,2048,64,0,f16,f16,0,1 +25,1536,2048,64,0,f16,f16,0,1 +25,3072,2048,64,0,f16,f16,0,1 
+25,4096,2048,64,0,f16,f16,0,1 +25,4608,2048,64,0,f16,f16,0,1 +25,7168,2048,64,0,f16,f16,0,1 +25,512,2304,64,0,f16,f16,0,1 +25,1536,2304,64,0,f16,f16,0,1 +25,3072,2304,64,0,f16,f16,0,1 +25,4096,2304,64,0,f16,f16,0,1 +25,4608,2304,64,0,f16,f16,0,1 +25,7168,2304,64,0,f16,f16,0,1 +25,512,7168,64,0,f16,f16,0,1 +25,1536,7168,64,0,f16,f16,0,1 +25,3072,7168,64,0,f16,f16,0,1 +25,4096,7168,64,0,f16,f16,0,1 +25,4608,7168,64,0,f16,f16,0,1 +25,7168,7168,64,0,f16,f16,0,1 +26,576,256,64,0,f16,f16,0,1 +26,1536,256,64,0,f16,f16,0,1 +26,3072,256,64,0,f16,f16,0,1 +26,4096,256,64,0,f16,f16,0,1 +26,4608,256,64,0,f16,f16,0,1 +26,7168,256,64,0,f16,f16,0,1 +26,576,512,64,0,f16,f16,0,1 +26,1536,512,64,0,f16,f16,0,1 +26,3072,512,64,0,f16,f16,0,1 +26,4096,512,64,0,f16,f16,0,1 +26,4608,512,64,0,f16,f16,0,1 +26,7168,512,64,0,f16,f16,0,1 +26,576,1536,64,0,f16,f16,0,1 +26,1536,1536,64,0,f16,f16,0,1 +26,3072,1536,64,0,f16,f16,0,1 +26,4096,1536,64,0,f16,f16,0,1 +26,4608,1536,64,0,f16,f16,0,1 +26,7168,1536,64,0,f16,f16,0,1 +26,512,2048,64,0,f16,f16,0,1 +26,1536,2048,64,0,f16,f16,0,1 +26,3072,2048,64,0,f16,f16,0,1 +26,4096,2048,64,0,f16,f16,0,1 +26,4608,2048,64,0,f16,f16,0,1 +26,7168,2048,64,0,f16,f16,0,1 +26,512,2304,64,0,f16,f16,0,1 +26,1536,2304,64,0,f16,f16,0,1 +26,3072,2304,64,0,f16,f16,0,1 +26,4096,2304,64,0,f16,f16,0,1 +26,4608,2304,64,0,f16,f16,0,1 +26,7168,2304,64,0,f16,f16,0,1 +26,512,7168,64,0,f16,f16,0,1 +26,1536,7168,64,0,f16,f16,0,1 +26,3072,7168,64,0,f16,f16,0,1 +26,4096,7168,64,0,f16,f16,0,1 +26,4608,7168,64,0,f16,f16,0,1 +26,7168,7168,64,0,f16,f16,0,1 +27,576,256,64,0,f16,f16,0,1 +27,1536,256,64,0,f16,f16,0,1 +27,3072,256,64,0,f16,f16,0,1 +27,4096,256,64,0,f16,f16,0,1 +27,4608,256,64,0,f16,f16,0,1 +27,7168,256,64,0,f16,f16,0,1 +27,576,512,64,0,f16,f16,0,1 +27,1536,512,64,0,f16,f16,0,1 +27,3072,512,64,0,f16,f16,0,1 +27,4096,512,64,0,f16,f16,0,1 +27,4608,512,64,0,f16,f16,0,1 +27,7168,512,64,0,f16,f16,0,1 +27,576,1536,64,0,f16,f16,0,1 +27,1536,1536,64,0,f16,f16,0,1 
+27,3072,1536,64,0,f16,f16,0,1 +27,4096,1536,64,0,f16,f16,0,1 +27,4608,1536,64,0,f16,f16,0,1 +27,7168,1536,64,0,f16,f16,0,1 +27,512,2048,64,0,f16,f16,0,1 +27,1536,2048,64,0,f16,f16,0,1 +27,3072,2048,64,0,f16,f16,0,1 +27,4096,2048,64,0,f16,f16,0,1 +27,4608,2048,64,0,f16,f16,0,1 +27,7168,2048,64,0,f16,f16,0,1 +27,512,2304,64,0,f16,f16,0,1 +27,1536,2304,64,0,f16,f16,0,1 +27,3072,2304,64,0,f16,f16,0,1 +27,4096,2304,64,0,f16,f16,0,1 +27,4608,2304,64,0,f16,f16,0,1 +27,7168,2304,64,0,f16,f16,0,1 +27,512,7168,64,0,f16,f16,0,1 +27,1536,7168,64,0,f16,f16,0,1 +27,3072,7168,64,0,f16,f16,0,1 +27,4096,7168,64,0,f16,f16,0,1 +27,4608,7168,64,0,f16,f16,0,1 +27,7168,7168,64,0,f16,f16,0,1 +28,576,256,64,0,f16,f16,0,1 +28,1536,256,64,0,f16,f16,0,1 +28,3072,256,64,0,f16,f16,0,1 +28,4096,256,64,0,f16,f16,0,1 +28,4608,256,64,0,f16,f16,0,1 +28,7168,256,64,0,f16,f16,0,1 +28,576,512,64,0,f16,f16,0,1 +28,1536,512,64,0,f16,f16,0,1 +28,3072,512,64,0,f16,f16,0,1 +28,4096,512,64,0,f16,f16,0,1 +28,4608,512,64,0,f16,f16,0,1 +28,7168,512,64,0,f16,f16,0,1 +28,576,1536,64,0,f16,f16,0,1 +28,1536,1536,64,0,f16,f16,0,1 +28,3072,1536,64,0,f16,f16,0,1 +28,4096,1536,64,0,f16,f16,0,1 +28,4608,1536,64,0,f16,f16,0,1 +28,7168,1536,64,0,f16,f16,0,1 +28,512,2048,64,0,f16,f16,0,1 +28,1536,2048,64,0,f16,f16,0,1 +28,3072,2048,64,0,f16,f16,0,1 +28,4096,2048,64,0,f16,f16,0,1 +28,4608,2048,64,0,f16,f16,0,1 +28,7168,2048,64,0,f16,f16,0,1 +28,512,2304,64,0,f16,f16,0,1 +28,1536,2304,64,0,f16,f16,0,1 +28,3072,2304,64,0,f16,f16,0,1 +28,4096,2304,64,0,f16,f16,0,1 +28,4608,2304,64,0,f16,f16,0,1 +28,7168,2304,64,0,f16,f16,0,1 +28,512,7168,64,0,f16,f16,0,1 +28,1536,7168,64,0,f16,f16,0,1 +28,3072,7168,64,0,f16,f16,0,1 +28,4096,7168,64,0,f16,f16,0,1 +28,4608,7168,64,0,f16,f16,0,1 +28,7168,7168,64,0,f16,f16,0,1 +29,576,256,64,0,f16,f16,0,1 +29,1536,256,64,0,f16,f16,0,1 +29,3072,256,64,0,f16,f16,0,1 +29,4096,256,64,0,f16,f16,0,1 +29,4608,256,64,0,f16,f16,0,1 +29,7168,256,64,0,f16,f16,0,1 +29,576,512,64,0,f16,f16,0,1 
+29,1536,512,64,0,f16,f16,0,1 +29,3072,512,64,0,f16,f16,0,1 +29,4096,512,64,0,f16,f16,0,1 +29,4608,512,64,0,f16,f16,0,1 +29,7168,512,64,0,f16,f16,0,1 +29,576,1536,64,0,f16,f16,0,1 +29,1536,1536,64,0,f16,f16,0,1 +29,3072,1536,64,0,f16,f16,0,1 +29,4096,1536,64,0,f16,f16,0,1 +29,4608,1536,64,0,f16,f16,0,1 +29,7168,1536,64,0,f16,f16,0,1 +29,512,2048,64,0,f16,f16,0,1 +29,1536,2048,64,0,f16,f16,0,1 +29,3072,2048,64,0,f16,f16,0,1 +29,4096,2048,64,0,f16,f16,0,1 +29,4608,2048,64,0,f16,f16,0,1 +29,7168,2048,64,0,f16,f16,0,1 +29,512,2304,64,0,f16,f16,0,1 +29,1536,2304,64,0,f16,f16,0,1 +29,3072,2304,64,0,f16,f16,0,1 +29,4096,2304,64,0,f16,f16,0,1 +29,4608,2304,64,0,f16,f16,0,1 +29,7168,2304,64,0,f16,f16,0,1 +29,512,7168,64,0,f16,f16,0,1 +29,1536,7168,64,0,f16,f16,0,1 +29,3072,7168,64,0,f16,f16,0,1 +29,4096,7168,64,0,f16,f16,0,1 +29,4608,7168,64,0,f16,f16,0,1 +29,7168,7168,64,0,f16,f16,0,1 +30,576,256,64,0,f16,f16,0,1 +30,1536,256,64,0,f16,f16,0,1 +30,3072,256,64,0,f16,f16,0,1 +30,4096,256,64,0,f16,f16,0,1 +30,4608,256,64,0,f16,f16,0,1 +30,7168,256,64,0,f16,f16,0,1 +30,576,512,64,0,f16,f16,0,1 +30,1536,512,64,0,f16,f16,0,1 +30,3072,512,64,0,f16,f16,0,1 +30,4096,512,64,0,f16,f16,0,1 +30,4608,512,64,0,f16,f16,0,1 +30,7168,512,64,0,f16,f16,0,1 +30,576,1536,64,0,f16,f16,0,1 +30,1536,1536,64,0,f16,f16,0,1 +30,3072,1536,64,0,f16,f16,0,1 +30,4096,1536,64,0,f16,f16,0,1 +30,4608,1536,64,0,f16,f16,0,1 +30,7168,1536,64,0,f16,f16,0,1 +30,512,2048,64,0,f16,f16,0,1 +30,1536,2048,64,0,f16,f16,0,1 +30,3072,2048,64,0,f16,f16,0,1 +30,4096,2048,64,0,f16,f16,0,1 +30,4608,2048,64,0,f16,f16,0,1 +30,7168,2048,64,0,f16,f16,0,1 +30,512,2304,64,0,f16,f16,0,1 +30,1536,2304,64,0,f16,f16,0,1 +30,3072,2304,64,0,f16,f16,0,1 +30,4096,2304,64,0,f16,f16,0,1 +30,4608,2304,64,0,f16,f16,0,1 +30,7168,2304,64,0,f16,f16,0,1 +30,512,7168,64,0,f16,f16,0,1 +30,1536,7168,64,0,f16,f16,0,1 +30,3072,7168,64,0,f16,f16,0,1 +30,4096,7168,64,0,f16,f16,0,1 +30,4608,7168,64,0,f16,f16,0,1 +30,7168,7168,64,0,f16,f16,0,1 
+31,576,256,64,0,f16,f16,0,1 +31,1536,256,64,0,f16,f16,0,1 +31,3072,256,64,0,f16,f16,0,1 +31,4096,256,64,0,f16,f16,0,1 +31,4608,256,64,0,f16,f16,0,1 +31,7168,256,64,0,f16,f16,0,1 +31,576,512,64,0,f16,f16,0,1 +31,1536,512,64,0,f16,f16,0,1 +31,3072,512,64,0,f16,f16,0,1 +31,4096,512,64,0,f16,f16,0,1 +31,4608,512,64,0,f16,f16,0,1 +31,7168,512,64,0,f16,f16,0,1 +31,576,1536,64,0,f16,f16,0,1 +31,1536,1536,64,0,f16,f16,0,1 +31,3072,1536,64,0,f16,f16,0,1 +31,4096,1536,64,0,f16,f16,0,1 +31,4608,1536,64,0,f16,f16,0,1 +31,7168,1536,64,0,f16,f16,0,1 +31,512,2048,64,0,f16,f16,0,1 +31,1536,2048,64,0,f16,f16,0,1 +31,3072,2048,64,0,f16,f16,0,1 +31,4096,2048,64,0,f16,f16,0,1 +31,4608,2048,64,0,f16,f16,0,1 +31,7168,2048,64,0,f16,f16,0,1 +31,512,2304,64,0,f16,f16,0,1 +31,1536,2304,64,0,f16,f16,0,1 +31,3072,2304,64,0,f16,f16,0,1 +31,4096,2304,64,0,f16,f16,0,1 +31,4608,2304,64,0,f16,f16,0,1 +31,7168,2304,64,0,f16,f16,0,1 +31,512,7168,64,0,f16,f16,0,1 +31,1536,7168,64,0,f16,f16,0,1 +31,3072,7168,64,0,f16,f16,0,1 +31,4096,7168,64,0,f16,f16,0,1 +31,4608,7168,64,0,f16,f16,0,1 +31,7168,7168,64,0,f16,f16,0,1 +32,576,256,64,0,f16,f16,0,1 +32,1536,256,64,0,f16,f16,0,1 +32,3072,256,64,0,f16,f16,0,1 +32,4096,256,64,0,f16,f16,0,1 +32,4608,256,64,0,f16,f16,0,1 +32,7168,256,64,0,f16,f16,0,1 +32,576,512,64,0,f16,f16,0,1 +32,1536,512,64,0,f16,f16,0,1 +32,3072,512,64,0,f16,f16,0,1 +32,4096,512,64,0,f16,f16,0,1 +32,4608,512,64,0,f16,f16,0,1 +32,7168,512,64,0,f16,f16,0,1 +32,576,1536,64,0,f16,f16,0,1 +32,1536,1536,64,0,f16,f16,0,1 +32,3072,1536,64,0,f16,f16,0,1 +32,4096,1536,64,0,f16,f16,0,1 +32,4608,1536,64,0,f16,f16,0,1 +32,7168,1536,64,0,f16,f16,0,1 +32,512,2048,64,0,f16,f16,0,1 +32,1536,2048,64,0,f16,f16,0,1 +32,3072,2048,64,0,f16,f16,0,1 +32,4096,2048,64,0,f16,f16,0,1 +32,4608,2048,64,0,f16,f16,0,1 +32,7168,2048,64,0,f16,f16,0,1 +32,512,2304,64,0,f16,f16,0,1 +32,1536,2304,64,0,f16,f16,0,1 +32,3072,2304,64,0,f16,f16,0,1 +32,4096,2304,64,0,f16,f16,0,1 +32,4608,2304,64,0,f16,f16,0,1 
+32,7168,2304,64,0,f16,f16,0,1 +32,512,7168,64,0,f16,f16,0,1 +32,1536,7168,64,0,f16,f16,0,1 +32,3072,7168,64,0,f16,f16,0,1 +32,4096,7168,64,0,f16,f16,0,1 +32,4608,7168,64,0,f16,f16,0,1 +32,7168,7168,64,0,f16,f16,0,1 +33,576,256,64,0,f16,f16,0,1 +33,1536,256,64,0,f16,f16,0,1 +33,3072,256,64,0,f16,f16,0,1 +33,4096,256,64,0,f16,f16,0,1 +33,4608,256,64,0,f16,f16,0,1 +33,7168,256,64,0,f16,f16,0,1 +33,576,512,64,0,f16,f16,0,1 +33,1536,512,64,0,f16,f16,0,1 +33,3072,512,64,0,f16,f16,0,1 +33,4096,512,64,0,f16,f16,0,1 +33,4608,512,64,0,f16,f16,0,1 +33,7168,512,64,0,f16,f16,0,1 +33,576,1536,64,0,f16,f16,0,1 +33,1536,1536,64,0,f16,f16,0,1 +33,3072,1536,64,0,f16,f16,0,1 +33,4096,1536,64,0,f16,f16,0,1 +33,4608,1536,64,0,f16,f16,0,1 +33,7168,1536,64,0,f16,f16,0,1 +33,512,2048,64,0,f16,f16,0,1 +33,1536,2048,64,0,f16,f16,0,1 +33,3072,2048,64,0,f16,f16,0,1 +33,4096,2048,64,0,f16,f16,0,1 +33,4608,2048,64,0,f16,f16,0,1 +33,7168,2048,64,0,f16,f16,0,1 +33,512,2304,64,0,f16,f16,0,1 +33,1536,2304,64,0,f16,f16,0,1 +33,3072,2304,64,0,f16,f16,0,1 +33,4096,2304,64,0,f16,f16,0,1 +33,4608,2304,64,0,f16,f16,0,1 +33,7168,2304,64,0,f16,f16,0,1 +33,512,7168,64,0,f16,f16,0,1 +33,1536,7168,64,0,f16,f16,0,1 +33,3072,7168,64,0,f16,f16,0,1 +33,4096,7168,64,0,f16,f16,0,1 +33,4608,7168,64,0,f16,f16,0,1 +33,7168,7168,64,0,f16,f16,0,1 +34,576,256,64,0,f16,f16,0,1 +34,1536,256,64,0,f16,f16,0,1 +34,3072,256,64,0,f16,f16,0,1 +34,4096,256,64,0,f16,f16,0,1 +34,4608,256,64,0,f16,f16,0,1 +34,7168,256,64,0,f16,f16,0,1 +34,576,512,64,0,f16,f16,0,1 +34,1536,512,64,0,f16,f16,0,1 +34,3072,512,64,0,f16,f16,0,1 +34,4096,512,64,0,f16,f16,0,1 +34,4608,512,64,0,f16,f16,0,1 +34,7168,512,64,0,f16,f16,0,1 +34,576,1536,64,0,f16,f16,0,1 +34,1536,1536,64,0,f16,f16,0,1 +34,3072,1536,64,0,f16,f16,0,1 +34,4096,1536,64,0,f16,f16,0,1 +34,4608,1536,64,0,f16,f16,0,1 +34,7168,1536,64,0,f16,f16,0,1 +34,512,2048,64,0,f16,f16,0,1 +34,1536,2048,64,0,f16,f16,0,1 +34,3072,2048,64,0,f16,f16,0,1 +34,4096,2048,64,0,f16,f16,0,1 
+34,4608,2048,64,0,f16,f16,0,1 +34,7168,2048,64,0,f16,f16,0,1 +34,512,2304,64,0,f16,f16,0,1 +34,1536,2304,64,0,f16,f16,0,1 +34,3072,2304,64,0,f16,f16,0,1 +34,4096,2304,64,0,f16,f16,0,1 +34,4608,2304,64,0,f16,f16,0,1 +34,7168,2304,64,0,f16,f16,0,1 +34,512,7168,64,0,f16,f16,0,1 +34,1536,7168,64,0,f16,f16,0,1 +34,3072,7168,64,0,f16,f16,0,1 +34,4096,7168,64,0,f16,f16,0,1 +34,4608,7168,64,0,f16,f16,0,1 +34,7168,7168,64,0,f16,f16,0,1 +35,576,256,64,0,f16,f16,0,1 +35,1536,256,64,0,f16,f16,0,1 +35,3072,256,64,0,f16,f16,0,1 +35,4096,256,64,0,f16,f16,0,1 +35,4608,256,64,0,f16,f16,0,1 +35,7168,256,64,0,f16,f16,0,1 +35,576,512,64,0,f16,f16,0,1 +35,1536,512,64,0,f16,f16,0,1 +35,3072,512,64,0,f16,f16,0,1 +35,4096,512,64,0,f16,f16,0,1 +35,4608,512,64,0,f16,f16,0,1 +35,7168,512,64,0,f16,f16,0,1 +35,576,1536,64,0,f16,f16,0,1 +35,1536,1536,64,0,f16,f16,0,1 +35,3072,1536,64,0,f16,f16,0,1 +35,4096,1536,64,0,f16,f16,0,1 +35,4608,1536,64,0,f16,f16,0,1 +35,7168,1536,64,0,f16,f16,0,1 +35,512,2048,64,0,f16,f16,0,1 +35,1536,2048,64,0,f16,f16,0,1 +35,3072,2048,64,0,f16,f16,0,1 +35,4096,2048,64,0,f16,f16,0,1 +35,4608,2048,64,0,f16,f16,0,1 +35,7168,2048,64,0,f16,f16,0,1 +35,512,2304,64,0,f16,f16,0,1 +35,1536,2304,64,0,f16,f16,0,1 +35,3072,2304,64,0,f16,f16,0,1 +35,4096,2304,64,0,f16,f16,0,1 +35,4608,2304,64,0,f16,f16,0,1 +35,7168,2304,64,0,f16,f16,0,1 +35,512,7168,64,0,f16,f16,0,1 +35,1536,7168,64,0,f16,f16,0,1 +35,3072,7168,64,0,f16,f16,0,1 +35,4096,7168,64,0,f16,f16,0,1 +35,4608,7168,64,0,f16,f16,0,1 +35,7168,7168,64,0,f16,f16,0,1 +36,576,256,64,0,f16,f16,0,1 +36,1536,256,64,0,f16,f16,0,1 +36,3072,256,64,0,f16,f16,0,1 +36,4096,256,64,0,f16,f16,0,1 +36,4608,256,64,0,f16,f16,0,1 +36,7168,256,64,0,f16,f16,0,1 +36,576,512,64,0,f16,f16,0,1 +36,1536,512,64,0,f16,f16,0,1 +36,3072,512,64,0,f16,f16,0,1 +36,4096,512,64,0,f16,f16,0,1 +36,4608,512,64,0,f16,f16,0,1 +36,7168,512,64,0,f16,f16,0,1 +36,576,1536,64,0,f16,f16,0,1 +36,1536,1536,64,0,f16,f16,0,1 +36,3072,1536,64,0,f16,f16,0,1 
+36,4096,1536,64,0,f16,f16,0,1 +36,4608,1536,64,0,f16,f16,0,1 +36,7168,1536,64,0,f16,f16,0,1 +36,512,2048,64,0,f16,f16,0,1 +36,1536,2048,64,0,f16,f16,0,1 +36,3072,2048,64,0,f16,f16,0,1 +36,4096,2048,64,0,f16,f16,0,1 +36,4608,2048,64,0,f16,f16,0,1 +36,7168,2048,64,0,f16,f16,0,1 +36,512,2304,64,0,f16,f16,0,1 +36,1536,2304,64,0,f16,f16,0,1 +36,3072,2304,64,0,f16,f16,0,1 +36,4096,2304,64,0,f16,f16,0,1 +36,4608,2304,64,0,f16,f16,0,1 +36,7168,2304,64,0,f16,f16,0,1 +36,512,7168,64,0,f16,f16,0,1 +36,1536,7168,64,0,f16,f16,0,1 +36,3072,7168,64,0,f16,f16,0,1 +36,4096,7168,64,0,f16,f16,0,1 +36,4608,7168,64,0,f16,f16,0,1 +36,7168,7168,64,0,f16,f16,0,1 +37,576,256,64,0,f16,f16,0,1 +37,1536,256,64,0,f16,f16,0,1 +37,3072,256,64,0,f16,f16,0,1 +37,4096,256,64,0,f16,f16,0,1 +37,4608,256,64,0,f16,f16,0,1 +37,7168,256,64,0,f16,f16,0,1 +37,576,512,64,0,f16,f16,0,1 +37,1536,512,64,0,f16,f16,0,1 +37,3072,512,64,0,f16,f16,0,1 +37,4096,512,64,0,f16,f16,0,1 +37,4608,512,64,0,f16,f16,0,1 +37,7168,512,64,0,f16,f16,0,1 +37,576,1536,64,0,f16,f16,0,1 +37,1536,1536,64,0,f16,f16,0,1 +37,3072,1536,64,0,f16,f16,0,1 +37,4096,1536,64,0,f16,f16,0,1 +37,4608,1536,64,0,f16,f16,0,1 +37,7168,1536,64,0,f16,f16,0,1 +37,512,2048,64,0,f16,f16,0,1 +37,1536,2048,64,0,f16,f16,0,1 +37,3072,2048,64,0,f16,f16,0,1 +37,4096,2048,64,0,f16,f16,0,1 +37,4608,2048,64,0,f16,f16,0,1 +37,7168,2048,64,0,f16,f16,0,1 +37,512,2304,64,0,f16,f16,0,1 +37,1536,2304,64,0,f16,f16,0,1 +37,3072,2304,64,0,f16,f16,0,1 +37,4096,2304,64,0,f16,f16,0,1 +37,4608,2304,64,0,f16,f16,0,1 +37,7168,2304,64,0,f16,f16,0,1 +37,512,7168,64,0,f16,f16,0,1 +37,1536,7168,64,0,f16,f16,0,1 +37,3072,7168,64,0,f16,f16,0,1 +37,4096,7168,64,0,f16,f16,0,1 +37,4608,7168,64,0,f16,f16,0,1 +37,7168,7168,64,0,f16,f16,0,1 +38,576,256,64,0,f16,f16,0,1 +38,1536,256,64,0,f16,f16,0,1 +38,3072,256,64,0,f16,f16,0,1 +38,4096,256,64,0,f16,f16,0,1 +38,4608,256,64,0,f16,f16,0,1 +38,7168,256,64,0,f16,f16,0,1 +38,576,512,64,0,f16,f16,0,1 +38,1536,512,64,0,f16,f16,0,1 
+38,3072,512,64,0,f16,f16,0,1 +38,4096,512,64,0,f16,f16,0,1 +38,4608,512,64,0,f16,f16,0,1 +38,7168,512,64,0,f16,f16,0,1 +38,576,1536,64,0,f16,f16,0,1 +38,1536,1536,64,0,f16,f16,0,1 +38,3072,1536,64,0,f16,f16,0,1 +38,4096,1536,64,0,f16,f16,0,1 +38,4608,1536,64,0,f16,f16,0,1 +38,7168,1536,64,0,f16,f16,0,1 +38,512,2048,64,0,f16,f16,0,1 +38,1536,2048,64,0,f16,f16,0,1 +38,3072,2048,64,0,f16,f16,0,1 +38,4096,2048,64,0,f16,f16,0,1 +38,4608,2048,64,0,f16,f16,0,1 +38,7168,2048,64,0,f16,f16,0,1 +38,512,2304,64,0,f16,f16,0,1 +38,1536,2304,64,0,f16,f16,0,1 +38,3072,2304,64,0,f16,f16,0,1 +38,4096,2304,64,0,f16,f16,0,1 +38,4608,2304,64,0,f16,f16,0,1 +38,7168,2304,64,0,f16,f16,0,1 +38,512,7168,64,0,f16,f16,0,1 +38,1536,7168,64,0,f16,f16,0,1 +38,3072,7168,64,0,f16,f16,0,1 +38,4096,7168,64,0,f16,f16,0,1 +38,4608,7168,64,0,f16,f16,0,1 +38,7168,7168,64,0,f16,f16,0,1 +39,576,256,64,0,f16,f16,0,1 +39,1536,256,64,0,f16,f16,0,1 +39,3072,256,64,0,f16,f16,0,1 +39,4096,256,64,0,f16,f16,0,1 +39,4608,256,64,0,f16,f16,0,1 +39,7168,256,64,0,f16,f16,0,1 +39,576,512,64,0,f16,f16,0,1 +39,1536,512,64,0,f16,f16,0,1 +39,3072,512,64,0,f16,f16,0,1 +39,4096,512,64,0,f16,f16,0,1 +39,4608,512,64,0,f16,f16,0,1 +39,7168,512,64,0,f16,f16,0,1 +39,576,1536,64,0,f16,f16,0,1 +39,1536,1536,64,0,f16,f16,0,1 +39,3072,1536,64,0,f16,f16,0,1 +39,4096,1536,64,0,f16,f16,0,1 +39,4608,1536,64,0,f16,f16,0,1 +39,7168,1536,64,0,f16,f16,0,1 +39,512,2048,64,0,f16,f16,0,1 +39,1536,2048,64,0,f16,f16,0,1 +39,3072,2048,64,0,f16,f16,0,1 +39,4096,2048,64,0,f16,f16,0,1 +39,4608,2048,64,0,f16,f16,0,1 +39,7168,2048,64,0,f16,f16,0,1 +39,512,2304,64,0,f16,f16,0,1 +39,1536,2304,64,0,f16,f16,0,1 +39,3072,2304,64,0,f16,f16,0,1 +39,4096,2304,64,0,f16,f16,0,1 +39,4608,2304,64,0,f16,f16,0,1 +39,7168,2304,64,0,f16,f16,0,1 +39,512,7168,64,0,f16,f16,0,1 +39,1536,7168,64,0,f16,f16,0,1 +39,3072,7168,64,0,f16,f16,0,1 +39,4096,7168,64,0,f16,f16,0,1 +39,4608,7168,64,0,f16,f16,0,1 +39,7168,7168,64,0,f16,f16,0,1 +40,576,256,64,0,f16,f16,0,1 
+40,1536,256,64,0,f16,f16,0,1 +40,3072,256,64,0,f16,f16,0,1 +40,4096,256,64,0,f16,f16,0,1 +40,4608,256,64,0,f16,f16,0,1 +40,7168,256,64,0,f16,f16,0,1 +40,576,512,64,0,f16,f16,0,1 +40,1536,512,64,0,f16,f16,0,1 +40,3072,512,64,0,f16,f16,0,1 +40,4096,512,64,0,f16,f16,0,1 +40,4608,512,64,0,f16,f16,0,1 +40,7168,512,64,0,f16,f16,0,1 +40,576,1536,64,0,f16,f16,0,1 +40,1536,1536,64,0,f16,f16,0,1 +40,3072,1536,64,0,f16,f16,0,1 +40,4096,1536,64,0,f16,f16,0,1 +40,4608,1536,64,0,f16,f16,0,1 +40,7168,1536,64,0,f16,f16,0,1 +40,512,2048,64,0,f16,f16,0,1 +40,1536,2048,64,0,f16,f16,0,1 +40,3072,2048,64,0,f16,f16,0,1 +40,4096,2048,64,0,f16,f16,0,1 +40,4608,2048,64,0,f16,f16,0,1 +40,7168,2048,64,0,f16,f16,0,1 +40,512,2304,64,0,f16,f16,0,1 +40,1536,2304,64,0,f16,f16,0,1 +40,3072,2304,64,0,f16,f16,0,1 +40,4096,2304,64,0,f16,f16,0,1 +40,4608,2304,64,0,f16,f16,0,1 +40,7168,2304,64,0,f16,f16,0,1 +40,512,7168,64,0,f16,f16,0,1 +40,1536,7168,64,0,f16,f16,0,1 +40,3072,7168,64,0,f16,f16,0,1 +40,4096,7168,64,0,f16,f16,0,1 +40,4608,7168,64,0,f16,f16,0,1 +40,7168,7168,64,0,f16,f16,0,1 +41,576,256,64,0,f16,f16,0,1 +41,1536,256,64,0,f16,f16,0,1 +41,3072,256,64,0,f16,f16,0,1 +41,4096,256,64,0,f16,f16,0,1 +41,4608,256,64,0,f16,f16,0,1 +41,7168,256,64,0,f16,f16,0,1 +41,576,512,64,0,f16,f16,0,1 +41,1536,512,64,0,f16,f16,0,1 +41,3072,512,64,0,f16,f16,0,1 +41,4096,512,64,0,f16,f16,0,1 +41,4608,512,64,0,f16,f16,0,1 +41,7168,512,64,0,f16,f16,0,1 +41,576,1536,64,0,f16,f16,0,1 +41,1536,1536,64,0,f16,f16,0,1 +41,3072,1536,64,0,f16,f16,0,1 +41,4096,1536,64,0,f16,f16,0,1 +41,4608,1536,64,0,f16,f16,0,1 +41,7168,1536,64,0,f16,f16,0,1 +41,512,2048,64,0,f16,f16,0,1 +41,1536,2048,64,0,f16,f16,0,1 +41,3072,2048,64,0,f16,f16,0,1 +41,4096,2048,64,0,f16,f16,0,1 +41,4608,2048,64,0,f16,f16,0,1 +41,7168,2048,64,0,f16,f16,0,1 +41,512,2304,64,0,f16,f16,0,1 +41,1536,2304,64,0,f16,f16,0,1 +41,3072,2304,64,0,f16,f16,0,1 +41,4096,2304,64,0,f16,f16,0,1 +41,4608,2304,64,0,f16,f16,0,1 +41,7168,2304,64,0,f16,f16,0,1 
+41,512,7168,64,0,f16,f16,0,1 +41,1536,7168,64,0,f16,f16,0,1 +41,3072,7168,64,0,f16,f16,0,1 +41,4096,7168,64,0,f16,f16,0,1 +41,4608,7168,64,0,f16,f16,0,1 +41,7168,7168,64,0,f16,f16,0,1 +42,576,256,64,0,f16,f16,0,1 +42,1536,256,64,0,f16,f16,0,1 +42,3072,256,64,0,f16,f16,0,1 +42,4096,256,64,0,f16,f16,0,1 +42,4608,256,64,0,f16,f16,0,1 +42,7168,256,64,0,f16,f16,0,1 +42,576,512,64,0,f16,f16,0,1 +42,1536,512,64,0,f16,f16,0,1 +42,3072,512,64,0,f16,f16,0,1 +42,4096,512,64,0,f16,f16,0,1 +42,4608,512,64,0,f16,f16,0,1 +42,7168,512,64,0,f16,f16,0,1 +42,576,1536,64,0,f16,f16,0,1 +42,1536,1536,64,0,f16,f16,0,1 +42,3072,1536,64,0,f16,f16,0,1 +42,4096,1536,64,0,f16,f16,0,1 +42,4608,1536,64,0,f16,f16,0,1 +42,7168,1536,64,0,f16,f16,0,1 +42,512,2048,64,0,f16,f16,0,1 +42,1536,2048,64,0,f16,f16,0,1 +42,3072,2048,64,0,f16,f16,0,1 +42,4096,2048,64,0,f16,f16,0,1 +42,4608,2048,64,0,f16,f16,0,1 +42,7168,2048,64,0,f16,f16,0,1 +42,512,2304,64,0,f16,f16,0,1 +42,1536,2304,64,0,f16,f16,0,1 +42,3072,2304,64,0,f16,f16,0,1 +42,4096,2304,64,0,f16,f16,0,1 +42,4608,2304,64,0,f16,f16,0,1 +42,7168,2304,64,0,f16,f16,0,1 +42,512,7168,64,0,f16,f16,0,1 +42,1536,7168,64,0,f16,f16,0,1 +42,3072,7168,64,0,f16,f16,0,1 +42,4096,7168,64,0,f16,f16,0,1 +42,4608,7168,64,0,f16,f16,0,1 +42,7168,7168,64,0,f16,f16,0,1 +43,576,256,64,0,f16,f16,0,1 +43,1536,256,64,0,f16,f16,0,1 +43,3072,256,64,0,f16,f16,0,1 +43,4096,256,64,0,f16,f16,0,1 +43,4608,256,64,0,f16,f16,0,1 +43,7168,256,64,0,f16,f16,0,1 +43,576,512,64,0,f16,f16,0,1 +43,1536,512,64,0,f16,f16,0,1 +43,3072,512,64,0,f16,f16,0,1 +43,4096,512,64,0,f16,f16,0,1 +43,4608,512,64,0,f16,f16,0,1 +43,7168,512,64,0,f16,f16,0,1 +43,576,1536,64,0,f16,f16,0,1 +43,1536,1536,64,0,f16,f16,0,1 +43,3072,1536,64,0,f16,f16,0,1 +43,4096,1536,64,0,f16,f16,0,1 +43,4608,1536,64,0,f16,f16,0,1 +43,7168,1536,64,0,f16,f16,0,1 +43,512,2048,64,0,f16,f16,0,1 +43,1536,2048,64,0,f16,f16,0,1 +43,3072,2048,64,0,f16,f16,0,1 +43,4096,2048,64,0,f16,f16,0,1 +43,4608,2048,64,0,f16,f16,0,1 
+43,7168,2048,64,0,f16,f16,0,1 +43,512,2304,64,0,f16,f16,0,1 +43,1536,2304,64,0,f16,f16,0,1 +43,3072,2304,64,0,f16,f16,0,1 +43,4096,2304,64,0,f16,f16,0,1 +43,4608,2304,64,0,f16,f16,0,1 +43,7168,2304,64,0,f16,f16,0,1 +43,512,7168,64,0,f16,f16,0,1 +43,1536,7168,64,0,f16,f16,0,1 +43,3072,7168,64,0,f16,f16,0,1 +43,4096,7168,64,0,f16,f16,0,1 +43,4608,7168,64,0,f16,f16,0,1 +43,7168,7168,64,0,f16,f16,0,1 +44,576,256,64,0,f16,f16,0,1 +44,1536,256,64,0,f16,f16,0,1 +44,3072,256,64,0,f16,f16,0,1 +44,4096,256,64,0,f16,f16,0,1 +44,4608,256,64,0,f16,f16,0,1 +44,7168,256,64,0,f16,f16,0,1 +44,576,512,64,0,f16,f16,0,1 +44,1536,512,64,0,f16,f16,0,1 +44,3072,512,64,0,f16,f16,0,1 +44,4096,512,64,0,f16,f16,0,1 +44,4608,512,64,0,f16,f16,0,1 +44,7168,512,64,0,f16,f16,0,1 +44,576,1536,64,0,f16,f16,0,1 +44,1536,1536,64,0,f16,f16,0,1 +44,3072,1536,64,0,f16,f16,0,1 +44,4096,1536,64,0,f16,f16,0,1 +44,4608,1536,64,0,f16,f16,0,1 +44,7168,1536,64,0,f16,f16,0,1 +44,512,2048,64,0,f16,f16,0,1 +44,1536,2048,64,0,f16,f16,0,1 +44,3072,2048,64,0,f16,f16,0,1 +44,4096,2048,64,0,f16,f16,0,1 +44,4608,2048,64,0,f16,f16,0,1 +44,7168,2048,64,0,f16,f16,0,1 +44,512,2304,64,0,f16,f16,0,1 +44,1536,2304,64,0,f16,f16,0,1 +44,3072,2304,64,0,f16,f16,0,1 +44,4096,2304,64,0,f16,f16,0,1 +44,4608,2304,64,0,f16,f16,0,1 +44,7168,2304,64,0,f16,f16,0,1 +44,512,7168,64,0,f16,f16,0,1 +44,1536,7168,64,0,f16,f16,0,1 +44,3072,7168,64,0,f16,f16,0,1 +44,4096,7168,64,0,f16,f16,0,1 +44,4608,7168,64,0,f16,f16,0,1 +44,7168,7168,64,0,f16,f16,0,1 +45,576,256,64,0,f16,f16,0,1 +45,1536,256,64,0,f16,f16,0,1 +45,3072,256,64,0,f16,f16,0,1 +45,4096,256,64,0,f16,f16,0,1 +45,4608,256,64,0,f16,f16,0,1 +45,7168,256,64,0,f16,f16,0,1 +45,576,512,64,0,f16,f16,0,1 +45,1536,512,64,0,f16,f16,0,1 +45,3072,512,64,0,f16,f16,0,1 +45,4096,512,64,0,f16,f16,0,1 +45,4608,512,64,0,f16,f16,0,1 +45,7168,512,64,0,f16,f16,0,1 +45,576,1536,64,0,f16,f16,0,1 +45,1536,1536,64,0,f16,f16,0,1 +45,3072,1536,64,0,f16,f16,0,1 +45,4096,1536,64,0,f16,f16,0,1 
+45,4608,1536,64,0,f16,f16,0,1 +45,7168,1536,64,0,f16,f16,0,1 +45,512,2048,64,0,f16,f16,0,1 +45,1536,2048,64,0,f16,f16,0,1 +45,3072,2048,64,0,f16,f16,0,1 +45,4096,2048,64,0,f16,f16,0,1 +45,4608,2048,64,0,f16,f16,0,1 +45,7168,2048,64,0,f16,f16,0,1 +45,512,2304,64,0,f16,f16,0,1 +45,1536,2304,64,0,f16,f16,0,1 +45,3072,2304,64,0,f16,f16,0,1 +45,4096,2304,64,0,f16,f16,0,1 +45,4608,2304,64,0,f16,f16,0,1 +45,7168,2304,64,0,f16,f16,0,1 +45,512,7168,64,0,f16,f16,0,1 +45,1536,7168,64,0,f16,f16,0,1 +45,3072,7168,64,0,f16,f16,0,1 +45,4096,7168,64,0,f16,f16,0,1 +45,4608,7168,64,0,f16,f16,0,1 +45,7168,7168,64,0,f16,f16,0,1 +46,576,256,64,0,f16,f16,0,1 +46,1536,256,64,0,f16,f16,0,1 +46,3072,256,64,0,f16,f16,0,1 +46,4096,256,64,0,f16,f16,0,1 +46,4608,256,64,0,f16,f16,0,1 +46,7168,256,64,0,f16,f16,0,1 +46,576,512,64,0,f16,f16,0,1 +46,1536,512,64,0,f16,f16,0,1 +46,3072,512,64,0,f16,f16,0,1 +46,4096,512,64,0,f16,f16,0,1 +46,4608,512,64,0,f16,f16,0,1 +46,7168,512,64,0,f16,f16,0,1 +46,576,1536,64,0,f16,f16,0,1 +46,1536,1536,64,0,f16,f16,0,1 +46,3072,1536,64,0,f16,f16,0,1 +46,4096,1536,64,0,f16,f16,0,1 +46,4608,1536,64,0,f16,f16,0,1 +46,7168,1536,64,0,f16,f16,0,1 +46,512,2048,64,0,f16,f16,0,1 +46,1536,2048,64,0,f16,f16,0,1 +46,3072,2048,64,0,f16,f16,0,1 +46,4096,2048,64,0,f16,f16,0,1 +46,4608,2048,64,0,f16,f16,0,1 +46,7168,2048,64,0,f16,f16,0,1 +46,512,2304,64,0,f16,f16,0,1 +46,1536,2304,64,0,f16,f16,0,1 +46,3072,2304,64,0,f16,f16,0,1 +46,4096,2304,64,0,f16,f16,0,1 +46,4608,2304,64,0,f16,f16,0,1 +46,7168,2304,64,0,f16,f16,0,1 +46,512,7168,64,0,f16,f16,0,1 +46,1536,7168,64,0,f16,f16,0,1 +46,3072,7168,64,0,f16,f16,0,1 +46,4096,7168,64,0,f16,f16,0,1 +46,4608,7168,64,0,f16,f16,0,1 +46,7168,7168,64,0,f16,f16,0,1 +47,576,256,64,0,f16,f16,0,1 +47,1536,256,64,0,f16,f16,0,1 +47,3072,256,64,0,f16,f16,0,1 +47,4096,256,64,0,f16,f16,0,1 +47,4608,256,64,0,f16,f16,0,1 +47,7168,256,64,0,f16,f16,0,1 +47,576,512,64,0,f16,f16,0,1 +47,1536,512,64,0,f16,f16,0,1 +47,3072,512,64,0,f16,f16,0,1 
+47,4096,512,64,0,f16,f16,0,1 +47,4608,512,64,0,f16,f16,0,1 +47,7168,512,64,0,f16,f16,0,1 +47,576,1536,64,0,f16,f16,0,1 +47,1536,1536,64,0,f16,f16,0,1 +47,3072,1536,64,0,f16,f16,0,1 +47,4096,1536,64,0,f16,f16,0,1 +47,4608,1536,64,0,f16,f16,0,1 +47,7168,1536,64,0,f16,f16,0,1 +47,512,2048,64,0,f16,f16,0,1 +47,1536,2048,64,0,f16,f16,0,1 +47,3072,2048,64,0,f16,f16,0,1 +47,4096,2048,64,0,f16,f16,0,1 +47,4608,2048,64,0,f16,f16,0,1 +47,7168,2048,64,0,f16,f16,0,1 +47,512,2304,64,0,f16,f16,0,1 +47,1536,2304,64,0,f16,f16,0,1 +47,3072,2304,64,0,f16,f16,0,1 +47,4096,2304,64,0,f16,f16,0,1 +47,4608,2304,64,0,f16,f16,0,1 +47,7168,2304,64,0,f16,f16,0,1 +47,512,7168,64,0,f16,f16,0,1 +47,1536,7168,64,0,f16,f16,0,1 +47,3072,7168,64,0,f16,f16,0,1 +47,4096,7168,64,0,f16,f16,0,1 +47,4608,7168,64,0,f16,f16,0,1 +47,7168,7168,64,0,f16,f16,0,1 +48,576,256,64,0,f16,f16,0,1 +48,1536,256,64,0,f16,f16,0,1 +48,3072,256,64,0,f16,f16,0,1 +48,4096,256,64,0,f16,f16,0,1 +48,4608,256,64,0,f16,f16,0,1 +48,7168,256,64,0,f16,f16,0,1 +48,576,512,64,0,f16,f16,0,1 +48,1536,512,64,0,f16,f16,0,1 +48,3072,512,64,0,f16,f16,0,1 +48,4096,512,64,0,f16,f16,0,1 +48,4608,512,64,0,f16,f16,0,1 +48,7168,512,64,0,f16,f16,0,1 +48,576,1536,64,0,f16,f16,0,1 +48,1536,1536,64,0,f16,f16,0,1 +48,3072,1536,64,0,f16,f16,0,1 +48,4096,1536,64,0,f16,f16,0,1 +48,4608,1536,64,0,f16,f16,0,1 +48,7168,1536,64,0,f16,f16,0,1 +48,512,2048,64,0,f16,f16,0,1 +48,1536,2048,64,0,f16,f16,0,1 +48,3072,2048,64,0,f16,f16,0,1 +48,4096,2048,64,0,f16,f16,0,1 +48,4608,2048,64,0,f16,f16,0,1 +48,7168,2048,64,0,f16,f16,0,1 +48,512,2304,64,0,f16,f16,0,1 +48,1536,2304,64,0,f16,f16,0,1 +48,3072,2304,64,0,f16,f16,0,1 +48,4096,2304,64,0,f16,f16,0,1 +48,4608,2304,64,0,f16,f16,0,1 +48,7168,2304,64,0,f16,f16,0,1 +48,512,7168,64,0,f16,f16,0,1 +48,1536,7168,64,0,f16,f16,0,1 +48,3072,7168,64,0,f16,f16,0,1 +48,4096,7168,64,0,f16,f16,0,1 +48,4608,7168,64,0,f16,f16,0,1 +48,7168,7168,64,0,f16,f16,0,1 +49,576,256,64,0,f16,f16,0,1 +49,1536,256,64,0,f16,f16,0,1 
+49,3072,256,64,0,f16,f16,0,1 +49,4096,256,64,0,f16,f16,0,1 +49,4608,256,64,0,f16,f16,0,1 +49,7168,256,64,0,f16,f16,0,1 +49,576,512,64,0,f16,f16,0,1 +49,1536,512,64,0,f16,f16,0,1 +49,3072,512,64,0,f16,f16,0,1 +49,4096,512,64,0,f16,f16,0,1 +49,4608,512,64,0,f16,f16,0,1 +49,7168,512,64,0,f16,f16,0,1 +49,576,1536,64,0,f16,f16,0,1 +49,1536,1536,64,0,f16,f16,0,1 +49,3072,1536,64,0,f16,f16,0,1 +49,4096,1536,64,0,f16,f16,0,1 +49,4608,1536,64,0,f16,f16,0,1 +49,7168,1536,64,0,f16,f16,0,1 +49,512,2048,64,0,f16,f16,0,1 +49,1536,2048,64,0,f16,f16,0,1 +49,3072,2048,64,0,f16,f16,0,1 +49,4096,2048,64,0,f16,f16,0,1 +49,4608,2048,64,0,f16,f16,0,1 +49,7168,2048,64,0,f16,f16,0,1 +49,512,2304,64,0,f16,f16,0,1 +49,1536,2304,64,0,f16,f16,0,1 +49,3072,2304,64,0,f16,f16,0,1 +49,4096,2304,64,0,f16,f16,0,1 +49,4608,2304,64,0,f16,f16,0,1 +49,7168,2304,64,0,f16,f16,0,1 +49,512,7168,64,0,f16,f16,0,1 +49,1536,7168,64,0,f16,f16,0,1 +49,3072,7168,64,0,f16,f16,0,1 +49,4096,7168,64,0,f16,f16,0,1 +49,4608,7168,64,0,f16,f16,0,1 +49,7168,7168,64,0,f16,f16,0,1 +50,576,256,64,0,f16,f16,0,1 +50,1536,256,64,0,f16,f16,0,1 +50,3072,256,64,0,f16,f16,0,1 +50,4096,256,64,0,f16,f16,0,1 +50,4608,256,64,0,f16,f16,0,1 +50,7168,256,64,0,f16,f16,0,1 +50,576,512,64,0,f16,f16,0,1 +50,1536,512,64,0,f16,f16,0,1 +50,3072,512,64,0,f16,f16,0,1 +50,4096,512,64,0,f16,f16,0,1 +50,4608,512,64,0,f16,f16,0,1 +50,7168,512,64,0,f16,f16,0,1 +50,576,1536,64,0,f16,f16,0,1 +50,1536,1536,64,0,f16,f16,0,1 +50,3072,1536,64,0,f16,f16,0,1 +50,4096,1536,64,0,f16,f16,0,1 +50,4608,1536,64,0,f16,f16,0,1 +50,7168,1536,64,0,f16,f16,0,1 +50,512,2048,64,0,f16,f16,0,1 +50,1536,2048,64,0,f16,f16,0,1 +50,3072,2048,64,0,f16,f16,0,1 +50,4096,2048,64,0,f16,f16,0,1 +50,4608,2048,64,0,f16,f16,0,1 +50,7168,2048,64,0,f16,f16,0,1 +50,512,2304,64,0,f16,f16,0,1 +50,1536,2304,64,0,f16,f16,0,1 +50,3072,2304,64,0,f16,f16,0,1 +50,4096,2304,64,0,f16,f16,0,1 +50,4608,2304,64,0,f16,f16,0,1 +50,7168,2304,64,0,f16,f16,0,1 +50,512,7168,64,0,f16,f16,0,1 
+50,1536,7168,64,0,f16,f16,0,1 +50,3072,7168,64,0,f16,f16,0,1 +50,4096,7168,64,0,f16,f16,0,1 +50,4608,7168,64,0,f16,f16,0,1 +50,7168,7168,64,0,f16,f16,0,1 +51,576,256,64,0,f16,f16,0,1 +51,1536,256,64,0,f16,f16,0,1 +51,3072,256,64,0,f16,f16,0,1 +51,4096,256,64,0,f16,f16,0,1 +51,4608,256,64,0,f16,f16,0,1 +51,7168,256,64,0,f16,f16,0,1 +51,576,512,64,0,f16,f16,0,1 +51,1536,512,64,0,f16,f16,0,1 +51,3072,512,64,0,f16,f16,0,1 +51,4096,512,64,0,f16,f16,0,1 +51,4608,512,64,0,f16,f16,0,1 +51,7168,512,64,0,f16,f16,0,1 +51,576,1536,64,0,f16,f16,0,1 +51,1536,1536,64,0,f16,f16,0,1 +51,3072,1536,64,0,f16,f16,0,1 +51,4096,1536,64,0,f16,f16,0,1 +51,4608,1536,64,0,f16,f16,0,1 +51,7168,1536,64,0,f16,f16,0,1 +51,512,2048,64,0,f16,f16,0,1 +51,1536,2048,64,0,f16,f16,0,1 +51,3072,2048,64,0,f16,f16,0,1 +51,4096,2048,64,0,f16,f16,0,1 +51,4608,2048,64,0,f16,f16,0,1 +51,7168,2048,64,0,f16,f16,0,1 +51,512,2304,64,0,f16,f16,0,1 +51,1536,2304,64,0,f16,f16,0,1 +51,3072,2304,64,0,f16,f16,0,1 +51,4096,2304,64,0,f16,f16,0,1 +51,4608,2304,64,0,f16,f16,0,1 +51,7168,2304,64,0,f16,f16,0,1 +51,512,7168,64,0,f16,f16,0,1 +51,1536,7168,64,0,f16,f16,0,1 +51,3072,7168,64,0,f16,f16,0,1 +51,4096,7168,64,0,f16,f16,0,1 +51,4608,7168,64,0,f16,f16,0,1 +51,7168,7168,64,0,f16,f16,0,1 +52,576,256,64,0,f16,f16,0,1 +52,1536,256,64,0,f16,f16,0,1 +52,3072,256,64,0,f16,f16,0,1 +52,4096,256,64,0,f16,f16,0,1 +52,4608,256,64,0,f16,f16,0,1 +52,7168,256,64,0,f16,f16,0,1 +52,576,512,64,0,f16,f16,0,1 +52,1536,512,64,0,f16,f16,0,1 +52,3072,512,64,0,f16,f16,0,1 +52,4096,512,64,0,f16,f16,0,1 +52,4608,512,64,0,f16,f16,0,1 +52,7168,512,64,0,f16,f16,0,1 +52,576,1536,64,0,f16,f16,0,1 +52,1536,1536,64,0,f16,f16,0,1 +52,3072,1536,64,0,f16,f16,0,1 +52,4096,1536,64,0,f16,f16,0,1 +52,4608,1536,64,0,f16,f16,0,1 +52,7168,1536,64,0,f16,f16,0,1 +52,512,2048,64,0,f16,f16,0,1 +52,1536,2048,64,0,f16,f16,0,1 +52,3072,2048,64,0,f16,f16,0,1 +52,4096,2048,64,0,f16,f16,0,1 +52,4608,2048,64,0,f16,f16,0,1 +52,7168,2048,64,0,f16,f16,0,1 
+52,512,2304,64,0,f16,f16,0,1 +52,1536,2304,64,0,f16,f16,0,1 +52,3072,2304,64,0,f16,f16,0,1 +52,4096,2304,64,0,f16,f16,0,1 +52,4608,2304,64,0,f16,f16,0,1 +52,7168,2304,64,0,f16,f16,0,1 +52,512,7168,64,0,f16,f16,0,1 +52,1536,7168,64,0,f16,f16,0,1 +52,3072,7168,64,0,f16,f16,0,1 +52,4096,7168,64,0,f16,f16,0,1 +52,4608,7168,64,0,f16,f16,0,1 +52,7168,7168,64,0,f16,f16,0,1 +53,576,256,64,0,f16,f16,0,1 +53,1536,256,64,0,f16,f16,0,1 +53,3072,256,64,0,f16,f16,0,1 +53,4096,256,64,0,f16,f16,0,1 +53,4608,256,64,0,f16,f16,0,1 +53,7168,256,64,0,f16,f16,0,1 +53,576,512,64,0,f16,f16,0,1 +53,1536,512,64,0,f16,f16,0,1 +53,3072,512,64,0,f16,f16,0,1 +53,4096,512,64,0,f16,f16,0,1 +53,4608,512,64,0,f16,f16,0,1 +53,7168,512,64,0,f16,f16,0,1 +53,576,1536,64,0,f16,f16,0,1 +53,1536,1536,64,0,f16,f16,0,1 +53,3072,1536,64,0,f16,f16,0,1 +53,4096,1536,64,0,f16,f16,0,1 +53,4608,1536,64,0,f16,f16,0,1 +53,7168,1536,64,0,f16,f16,0,1 +53,512,2048,64,0,f16,f16,0,1 +53,1536,2048,64,0,f16,f16,0,1 +53,3072,2048,64,0,f16,f16,0,1 +53,4096,2048,64,0,f16,f16,0,1 +53,4608,2048,64,0,f16,f16,0,1 +53,7168,2048,64,0,f16,f16,0,1 +53,512,2304,64,0,f16,f16,0,1 +53,1536,2304,64,0,f16,f16,0,1 +53,3072,2304,64,0,f16,f16,0,1 +53,4096,2304,64,0,f16,f16,0,1 +53,4608,2304,64,0,f16,f16,0,1 +53,7168,2304,64,0,f16,f16,0,1 +53,512,7168,64,0,f16,f16,0,1 +53,1536,7168,64,0,f16,f16,0,1 +53,3072,7168,64,0,f16,f16,0,1 +53,4096,7168,64,0,f16,f16,0,1 +53,4608,7168,64,0,f16,f16,0,1 +53,7168,7168,64,0,f16,f16,0,1 +54,576,256,64,0,f16,f16,0,1 +54,1536,256,64,0,f16,f16,0,1 +54,3072,256,64,0,f16,f16,0,1 +54,4096,256,64,0,f16,f16,0,1 +54,4608,256,64,0,f16,f16,0,1 +54,7168,256,64,0,f16,f16,0,1 +54,576,512,64,0,f16,f16,0,1 +54,1536,512,64,0,f16,f16,0,1 +54,3072,512,64,0,f16,f16,0,1 +54,4096,512,64,0,f16,f16,0,1 +54,4608,512,64,0,f16,f16,0,1 +54,7168,512,64,0,f16,f16,0,1 +54,576,1536,64,0,f16,f16,0,1 +54,1536,1536,64,0,f16,f16,0,1 +54,3072,1536,64,0,f16,f16,0,1 +54,4096,1536,64,0,f16,f16,0,1 +54,4608,1536,64,0,f16,f16,0,1 
+54,7168,1536,64,0,f16,f16,0,1 +54,512,2048,64,0,f16,f16,0,1 +54,1536,2048,64,0,f16,f16,0,1 +54,3072,2048,64,0,f16,f16,0,1 +54,4096,2048,64,0,f16,f16,0,1 +54,4608,2048,64,0,f16,f16,0,1 +54,7168,2048,64,0,f16,f16,0,1 +54,512,2304,64,0,f16,f16,0,1 +54,1536,2304,64,0,f16,f16,0,1 +54,3072,2304,64,0,f16,f16,0,1 +54,4096,2304,64,0,f16,f16,0,1 +54,4608,2304,64,0,f16,f16,0,1 +54,7168,2304,64,0,f16,f16,0,1 +54,512,7168,64,0,f16,f16,0,1 +54,1536,7168,64,0,f16,f16,0,1 +54,3072,7168,64,0,f16,f16,0,1 +54,4096,7168,64,0,f16,f16,0,1 +54,4608,7168,64,0,f16,f16,0,1 +54,7168,7168,64,0,f16,f16,0,1 +55,576,256,64,0,f16,f16,0,1 +55,1536,256,64,0,f16,f16,0,1 +55,3072,256,64,0,f16,f16,0,1 +55,4096,256,64,0,f16,f16,0,1 +55,4608,256,64,0,f16,f16,0,1 +55,7168,256,64,0,f16,f16,0,1 +55,576,512,64,0,f16,f16,0,1 +55,1536,512,64,0,f16,f16,0,1 +55,3072,512,64,0,f16,f16,0,1 +55,4096,512,64,0,f16,f16,0,1 +55,4608,512,64,0,f16,f16,0,1 +55,7168,512,64,0,f16,f16,0,1 +55,576,1536,64,0,f16,f16,0,1 +55,1536,1536,64,0,f16,f16,0,1 +55,3072,1536,64,0,f16,f16,0,1 +55,4096,1536,64,0,f16,f16,0,1 +55,4608,1536,64,0,f16,f16,0,1 +55,7168,1536,64,0,f16,f16,0,1 +55,512,2048,64,0,f16,f16,0,1 +55,1536,2048,64,0,f16,f16,0,1 +55,3072,2048,64,0,f16,f16,0,1 +55,4096,2048,64,0,f16,f16,0,1 +55,4608,2048,64,0,f16,f16,0,1 +55,7168,2048,64,0,f16,f16,0,1 +55,512,2304,64,0,f16,f16,0,1 +55,1536,2304,64,0,f16,f16,0,1 +55,3072,2304,64,0,f16,f16,0,1 +55,4096,2304,64,0,f16,f16,0,1 +55,4608,2304,64,0,f16,f16,0,1 +55,7168,2304,64,0,f16,f16,0,1 +55,512,7168,64,0,f16,f16,0,1 +55,1536,7168,64,0,f16,f16,0,1 +55,3072,7168,64,0,f16,f16,0,1 +55,4096,7168,64,0,f16,f16,0,1 +55,4608,7168,64,0,f16,f16,0,1 +55,7168,7168,64,0,f16,f16,0,1 +56,576,256,64,0,f16,f16,0,1 +56,1536,256,64,0,f16,f16,0,1 +56,3072,256,64,0,f16,f16,0,1 +56,4096,256,64,0,f16,f16,0,1 +56,4608,256,64,0,f16,f16,0,1 +56,7168,256,64,0,f16,f16,0,1 +56,576,512,64,0,f16,f16,0,1 +56,1536,512,64,0,f16,f16,0,1 +56,3072,512,64,0,f16,f16,0,1 +56,4096,512,64,0,f16,f16,0,1 
+56,4608,512,64,0,f16,f16,0,1 +56,7168,512,64,0,f16,f16,0,1 +56,576,1536,64,0,f16,f16,0,1 +56,1536,1536,64,0,f16,f16,0,1 +56,3072,1536,64,0,f16,f16,0,1 +56,4096,1536,64,0,f16,f16,0,1 +56,4608,1536,64,0,f16,f16,0,1 +56,7168,1536,64,0,f16,f16,0,1 +56,512,2048,64,0,f16,f16,0,1 +56,1536,2048,64,0,f16,f16,0,1 +56,3072,2048,64,0,f16,f16,0,1 +56,4096,2048,64,0,f16,f16,0,1 +56,4608,2048,64,0,f16,f16,0,1 +56,7168,2048,64,0,f16,f16,0,1 +56,512,2304,64,0,f16,f16,0,1 +56,1536,2304,64,0,f16,f16,0,1 +56,3072,2304,64,0,f16,f16,0,1 +56,4096,2304,64,0,f16,f16,0,1 +56,4608,2304,64,0,f16,f16,0,1 +56,7168,2304,64,0,f16,f16,0,1 +56,512,7168,64,0,f16,f16,0,1 +56,1536,7168,64,0,f16,f16,0,1 +56,3072,7168,64,0,f16,f16,0,1 +56,4096,7168,64,0,f16,f16,0,1 +56,4608,7168,64,0,f16,f16,0,1 +56,7168,7168,64,0,f16,f16,0,1 +57,576,256,64,0,f16,f16,0,1 +57,1536,256,64,0,f16,f16,0,1 +57,3072,256,64,0,f16,f16,0,1 +57,4096,256,64,0,f16,f16,0,1 +57,4608,256,64,0,f16,f16,0,1 +57,7168,256,64,0,f16,f16,0,1 +57,576,512,64,0,f16,f16,0,1 +57,1536,512,64,0,f16,f16,0,1 +57,3072,512,64,0,f16,f16,0,1 +57,4096,512,64,0,f16,f16,0,1 +57,4608,512,64,0,f16,f16,0,1 +57,7168,512,64,0,f16,f16,0,1 +57,576,1536,64,0,f16,f16,0,1 +57,1536,1536,64,0,f16,f16,0,1 +57,3072,1536,64,0,f16,f16,0,1 +57,4096,1536,64,0,f16,f16,0,1 +57,4608,1536,64,0,f16,f16,0,1 +57,7168,1536,64,0,f16,f16,0,1 +57,512,2048,64,0,f16,f16,0,1 +57,1536,2048,64,0,f16,f16,0,1 +57,3072,2048,64,0,f16,f16,0,1 +57,4096,2048,64,0,f16,f16,0,1 +57,4608,2048,64,0,f16,f16,0,1 +57,7168,2048,64,0,f16,f16,0,1 +57,512,2304,64,0,f16,f16,0,1 +57,1536,2304,64,0,f16,f16,0,1 +57,3072,2304,64,0,f16,f16,0,1 +57,4096,2304,64,0,f16,f16,0,1 +57,4608,2304,64,0,f16,f16,0,1 +57,7168,2304,64,0,f16,f16,0,1 +57,512,7168,64,0,f16,f16,0,1 +57,1536,7168,64,0,f16,f16,0,1 +57,3072,7168,64,0,f16,f16,0,1 +57,4096,7168,64,0,f16,f16,0,1 +57,4608,7168,64,0,f16,f16,0,1 +57,7168,7168,64,0,f16,f16,0,1 +58,576,256,64,0,f16,f16,0,1 +58,1536,256,64,0,f16,f16,0,1 +58,3072,256,64,0,f16,f16,0,1 
+58,4096,256,64,0,f16,f16,0,1 +58,4608,256,64,0,f16,f16,0,1 +58,7168,256,64,0,f16,f16,0,1 +58,576,512,64,0,f16,f16,0,1 +58,1536,512,64,0,f16,f16,0,1 +58,3072,512,64,0,f16,f16,0,1 +58,4096,512,64,0,f16,f16,0,1 +58,4608,512,64,0,f16,f16,0,1 +58,7168,512,64,0,f16,f16,0,1 +58,576,1536,64,0,f16,f16,0,1 +58,1536,1536,64,0,f16,f16,0,1 +58,3072,1536,64,0,f16,f16,0,1 +58,4096,1536,64,0,f16,f16,0,1 +58,4608,1536,64,0,f16,f16,0,1 +58,7168,1536,64,0,f16,f16,0,1 +58,512,2048,64,0,f16,f16,0,1 +58,1536,2048,64,0,f16,f16,0,1 +58,3072,2048,64,0,f16,f16,0,1 +58,4096,2048,64,0,f16,f16,0,1 +58,4608,2048,64,0,f16,f16,0,1 +58,7168,2048,64,0,f16,f16,0,1 +58,512,2304,64,0,f16,f16,0,1 +58,1536,2304,64,0,f16,f16,0,1 +58,3072,2304,64,0,f16,f16,0,1 +58,4096,2304,64,0,f16,f16,0,1 +58,4608,2304,64,0,f16,f16,0,1 +58,7168,2304,64,0,f16,f16,0,1 +58,512,7168,64,0,f16,f16,0,1 +58,1536,7168,64,0,f16,f16,0,1 +58,3072,7168,64,0,f16,f16,0,1 +58,4096,7168,64,0,f16,f16,0,1 +58,4608,7168,64,0,f16,f16,0,1 +58,7168,7168,64,0,f16,f16,0,1 +59,576,256,64,0,f16,f16,0,1 +59,1536,256,64,0,f16,f16,0,1 +59,3072,256,64,0,f16,f16,0,1 +59,4096,256,64,0,f16,f16,0,1 +59,4608,256,64,0,f16,f16,0,1 +59,7168,256,64,0,f16,f16,0,1 +59,576,512,64,0,f16,f16,0,1 +59,1536,512,64,0,f16,f16,0,1 +59,3072,512,64,0,f16,f16,0,1 +59,4096,512,64,0,f16,f16,0,1 +59,4608,512,64,0,f16,f16,0,1 +59,7168,512,64,0,f16,f16,0,1 +59,576,1536,64,0,f16,f16,0,1 +59,1536,1536,64,0,f16,f16,0,1 +59,3072,1536,64,0,f16,f16,0,1 +59,4096,1536,64,0,f16,f16,0,1 +59,4608,1536,64,0,f16,f16,0,1 +59,7168,1536,64,0,f16,f16,0,1 +59,512,2048,64,0,f16,f16,0,1 +59,1536,2048,64,0,f16,f16,0,1 +59,3072,2048,64,0,f16,f16,0,1 +59,4096,2048,64,0,f16,f16,0,1 +59,4608,2048,64,0,f16,f16,0,1 +59,7168,2048,64,0,f16,f16,0,1 +59,512,2304,64,0,f16,f16,0,1 +59,1536,2304,64,0,f16,f16,0,1 +59,3072,2304,64,0,f16,f16,0,1 +59,4096,2304,64,0,f16,f16,0,1 +59,4608,2304,64,0,f16,f16,0,1 +59,7168,2304,64,0,f16,f16,0,1 +59,512,7168,64,0,f16,f16,0,1 +59,1536,7168,64,0,f16,f16,0,1 
+59,3072,7168,64,0,f16,f16,0,1 +59,4096,7168,64,0,f16,f16,0,1 +59,4608,7168,64,0,f16,f16,0,1 +59,7168,7168,64,0,f16,f16,0,1 +60,576,256,64,0,f16,f16,0,1 +60,1536,256,64,0,f16,f16,0,1 +60,3072,256,64,0,f16,f16,0,1 +60,4096,256,64,0,f16,f16,0,1 +60,4608,256,64,0,f16,f16,0,1 +60,7168,256,64,0,f16,f16,0,1 +60,576,512,64,0,f16,f16,0,1 +60,1536,512,64,0,f16,f16,0,1 +60,3072,512,64,0,f16,f16,0,1 +60,4096,512,64,0,f16,f16,0,1 +60,4608,512,64,0,f16,f16,0,1 +60,7168,512,64,0,f16,f16,0,1 +60,576,1536,64,0,f16,f16,0,1 +60,1536,1536,64,0,f16,f16,0,1 +60,3072,1536,64,0,f16,f16,0,1 +60,4096,1536,64,0,f16,f16,0,1 +60,4608,1536,64,0,f16,f16,0,1 +60,7168,1536,64,0,f16,f16,0,1 +60,512,2048,64,0,f16,f16,0,1 +60,1536,2048,64,0,f16,f16,0,1 +60,3072,2048,64,0,f16,f16,0,1 +60,4096,2048,64,0,f16,f16,0,1 +60,4608,2048,64,0,f16,f16,0,1 +60,7168,2048,64,0,f16,f16,0,1 +60,512,2304,64,0,f16,f16,0,1 +60,1536,2304,64,0,f16,f16,0,1 +60,3072,2304,64,0,f16,f16,0,1 +60,4096,2304,64,0,f16,f16,0,1 +60,4608,2304,64,0,f16,f16,0,1 +60,7168,2304,64,0,f16,f16,0,1 +60,512,7168,64,0,f16,f16,0,1 +60,1536,7168,64,0,f16,f16,0,1 +60,3072,7168,64,0,f16,f16,0,1 +60,4096,7168,64,0,f16,f16,0,1 +60,4608,7168,64,0,f16,f16,0,1 +60,7168,7168,64,0,f16,f16,0,1 +61,576,256,64,0,f16,f16,0,1 +61,1536,256,64,0,f16,f16,0,1 +61,3072,256,64,0,f16,f16,0,1 +61,4096,256,64,0,f16,f16,0,1 +61,4608,256,64,0,f16,f16,0,1 +61,7168,256,64,0,f16,f16,0,1 +61,576,512,64,0,f16,f16,0,1 +61,1536,512,64,0,f16,f16,0,1 +61,3072,512,64,0,f16,f16,0,1 +61,4096,512,64,0,f16,f16,0,1 +61,4608,512,64,0,f16,f16,0,1 +61,7168,512,64,0,f16,f16,0,1 +61,576,1536,64,0,f16,f16,0,1 +61,1536,1536,64,0,f16,f16,0,1 +61,3072,1536,64,0,f16,f16,0,1 +61,4096,1536,64,0,f16,f16,0,1 +61,4608,1536,64,0,f16,f16,0,1 +61,7168,1536,64,0,f16,f16,0,1 +61,512,2048,64,0,f16,f16,0,1 +61,1536,2048,64,0,f16,f16,0,1 +61,3072,2048,64,0,f16,f16,0,1 +61,4096,2048,64,0,f16,f16,0,1 +61,4608,2048,64,0,f16,f16,0,1 +61,7168,2048,64,0,f16,f16,0,1 +61,512,2304,64,0,f16,f16,0,1 
+61,1536,2304,64,0,f16,f16,0,1 +61,3072,2304,64,0,f16,f16,0,1 +61,4096,2304,64,0,f16,f16,0,1 +61,4608,2304,64,0,f16,f16,0,1 +61,7168,2304,64,0,f16,f16,0,1 +61,512,7168,64,0,f16,f16,0,1 +61,1536,7168,64,0,f16,f16,0,1 +61,3072,7168,64,0,f16,f16,0,1 +61,4096,7168,64,0,f16,f16,0,1 +61,4608,7168,64,0,f16,f16,0,1 +61,7168,7168,64,0,f16,f16,0,1 +62,576,256,64,0,f16,f16,0,1 +62,1536,256,64,0,f16,f16,0,1 +62,3072,256,64,0,f16,f16,0,1 +62,4096,256,64,0,f16,f16,0,1 +62,4608,256,64,0,f16,f16,0,1 +62,7168,256,64,0,f16,f16,0,1 +62,576,512,64,0,f16,f16,0,1 +62,1536,512,64,0,f16,f16,0,1 +62,3072,512,64,0,f16,f16,0,1 +62,4096,512,64,0,f16,f16,0,1 +62,4608,512,64,0,f16,f16,0,1 +62,7168,512,64,0,f16,f16,0,1 +62,576,1536,64,0,f16,f16,0,1 +62,1536,1536,64,0,f16,f16,0,1 +62,3072,1536,64,0,f16,f16,0,1 +62,4096,1536,64,0,f16,f16,0,1 +62,4608,1536,64,0,f16,f16,0,1 +62,7168,1536,64,0,f16,f16,0,1 +62,512,2048,64,0,f16,f16,0,1 +62,1536,2048,64,0,f16,f16,0,1 +62,3072,2048,64,0,f16,f16,0,1 +62,4096,2048,64,0,f16,f16,0,1 +62,4608,2048,64,0,f16,f16,0,1 +62,7168,2048,64,0,f16,f16,0,1 +62,512,2304,64,0,f16,f16,0,1 +62,1536,2304,64,0,f16,f16,0,1 +62,3072,2304,64,0,f16,f16,0,1 +62,4096,2304,64,0,f16,f16,0,1 +62,4608,2304,64,0,f16,f16,0,1 +62,7168,2304,64,0,f16,f16,0,1 +62,512,7168,64,0,f16,f16,0,1 +62,1536,7168,64,0,f16,f16,0,1 +62,3072,7168,64,0,f16,f16,0,1 +62,4096,7168,64,0,f16,f16,0,1 +62,4608,7168,64,0,f16,f16,0,1 +62,7168,7168,64,0,f16,f16,0,1 +63,576,256,64,0,f16,f16,0,1 +63,1536,256,64,0,f16,f16,0,1 +63,3072,256,64,0,f16,f16,0,1 +63,4096,256,64,0,f16,f16,0,1 +63,4608,256,64,0,f16,f16,0,1 +63,7168,256,64,0,f16,f16,0,1 +63,576,512,64,0,f16,f16,0,1 +63,1536,512,64,0,f16,f16,0,1 +63,3072,512,64,0,f16,f16,0,1 +63,4096,512,64,0,f16,f16,0,1 +63,4608,512,64,0,f16,f16,0,1 +63,7168,512,64,0,f16,f16,0,1 +63,576,1536,64,0,f16,f16,0,1 +63,1536,1536,64,0,f16,f16,0,1 +63,3072,1536,64,0,f16,f16,0,1 +63,4096,1536,64,0,f16,f16,0,1 +63,4608,1536,64,0,f16,f16,0,1 +63,7168,1536,64,0,f16,f16,0,1 
+63,512,2048,64,0,f16,f16,0,1 +63,1536,2048,64,0,f16,f16,0,1 +63,3072,2048,64,0,f16,f16,0,1 +63,4096,2048,64,0,f16,f16,0,1 +63,4608,2048,64,0,f16,f16,0,1 +63,7168,2048,64,0,f16,f16,0,1 +63,512,2304,64,0,f16,f16,0,1 +63,1536,2304,64,0,f16,f16,0,1 +63,3072,2304,64,0,f16,f16,0,1 +63,4096,2304,64,0,f16,f16,0,1 +63,4608,2304,64,0,f16,f16,0,1 +63,7168,2304,64,0,f16,f16,0,1 +63,512,7168,64,0,f16,f16,0,1 +63,1536,7168,64,0,f16,f16,0,1 +63,3072,7168,64,0,f16,f16,0,1 +63,4096,7168,64,0,f16,f16,0,1 +63,4608,7168,64,0,f16,f16,0,1 +63,7168,7168,64,0,f16,f16,0,1 +64,576,256,64,0,f16,f16,0,1 +64,1536,256,64,0,f16,f16,0,1 +64,3072,256,64,0,f16,f16,0,1 +64,4096,256,64,0,f16,f16,0,1 +64,4608,256,64,0,f16,f16,0,1 +64,7168,256,64,0,f16,f16,0,1 +64,576,512,64,0,f16,f16,0,1 +64,1536,512,64,0,f16,f16,0,1 +64,3072,512,64,0,f16,f16,0,1 +64,4096,512,64,0,f16,f16,0,1 +64,4608,512,64,0,f16,f16,0,1 +64,7168,512,64,0,f16,f16,0,1 +64,576,1536,64,0,f16,f16,0,1 +64,1536,1536,64,0,f16,f16,0,1 +64,3072,1536,64,0,f16,f16,0,1 +64,4096,1536,64,0,f16,f16,0,1 +64,4608,1536,64,0,f16,f16,0,1 +64,7168,1536,64,0,f16,f16,0,1 +64,512,2048,64,0,f16,f16,0,1 +64,1536,2048,64,0,f16,f16,0,1 +64,3072,2048,64,0,f16,f16,0,1 +64,4096,2048,64,0,f16,f16,0,1 +64,4608,2048,64,0,f16,f16,0,1 +64,7168,2048,64,0,f16,f16,0,1 +64,512,2304,64,0,f16,f16,0,1 +64,1536,2304,64,0,f16,f16,0,1 +64,3072,2304,64,0,f16,f16,0,1 +64,4096,2304,64,0,f16,f16,0,1 +64,4608,2304,64,0,f16,f16,0,1 +64,7168,2304,64,0,f16,f16,0,1 +64,512,7168,64,0,f16,f16,0,1 +64,1536,7168,64,0,f16,f16,0,1 +64,3072,7168,64,0,f16,f16,0,1 +64,4096,7168,64,0,f16,f16,0,1 +64,4608,7168,64,0,f16,f16,0,1 +64,7168,7168,64,0,f16,f16,0,1 +65,576,256,64,0,f16,f16,0,1 +65,1536,256,64,0,f16,f16,0,1 +65,3072,256,64,0,f16,f16,0,1 +65,4096,256,64,0,f16,f16,0,1 +65,4608,256,64,0,f16,f16,0,1 +65,7168,256,64,0,f16,f16,0,1 +65,576,512,64,0,f16,f16,0,1 +65,1536,512,64,0,f16,f16,0,1 +65,3072,512,64,0,f16,f16,0,1 +65,4096,512,64,0,f16,f16,0,1 +65,4608,512,64,0,f16,f16,0,1 
+65,7168,512,64,0,f16,f16,0,1 +65,576,1536,64,0,f16,f16,0,1 +65,1536,1536,64,0,f16,f16,0,1 +65,3072,1536,64,0,f16,f16,0,1 +65,4096,1536,64,0,f16,f16,0,1 +65,4608,1536,64,0,f16,f16,0,1 +65,7168,1536,64,0,f16,f16,0,1 +65,512,2048,64,0,f16,f16,0,1 +65,1536,2048,64,0,f16,f16,0,1 +65,3072,2048,64,0,f16,f16,0,1 +65,4096,2048,64,0,f16,f16,0,1 +65,4608,2048,64,0,f16,f16,0,1 +65,7168,2048,64,0,f16,f16,0,1 +65,512,2304,64,0,f16,f16,0,1 +65,1536,2304,64,0,f16,f16,0,1 +65,3072,2304,64,0,f16,f16,0,1 +65,4096,2304,64,0,f16,f16,0,1 +65,4608,2304,64,0,f16,f16,0,1 +65,7168,2304,64,0,f16,f16,0,1 +65,512,7168,64,0,f16,f16,0,1 +65,1536,7168,64,0,f16,f16,0,1 +65,3072,7168,64,0,f16,f16,0,1 +65,4096,7168,64,0,f16,f16,0,1 +65,4608,7168,64,0,f16,f16,0,1 +65,7168,7168,64,0,f16,f16,0,1 +66,576,256,64,0,f16,f16,0,1 +66,1536,256,64,0,f16,f16,0,1 +66,3072,256,64,0,f16,f16,0,1 +66,4096,256,64,0,f16,f16,0,1 +66,4608,256,64,0,f16,f16,0,1 +66,7168,256,64,0,f16,f16,0,1 +66,576,512,64,0,f16,f16,0,1 +66,1536,512,64,0,f16,f16,0,1 +66,3072,512,64,0,f16,f16,0,1 +66,4096,512,64,0,f16,f16,0,1 +66,4608,512,64,0,f16,f16,0,1 +66,7168,512,64,0,f16,f16,0,1 +66,576,1536,64,0,f16,f16,0,1 +66,1536,1536,64,0,f16,f16,0,1 +66,3072,1536,64,0,f16,f16,0,1 +66,4096,1536,64,0,f16,f16,0,1 +66,4608,1536,64,0,f16,f16,0,1 +66,7168,1536,64,0,f16,f16,0,1 +66,512,2048,64,0,f16,f16,0,1 +66,1536,2048,64,0,f16,f16,0,1 +66,3072,2048,64,0,f16,f16,0,1 +66,4096,2048,64,0,f16,f16,0,1 +66,4608,2048,64,0,f16,f16,0,1 +66,7168,2048,64,0,f16,f16,0,1 +66,512,2304,64,0,f16,f16,0,1 +66,1536,2304,64,0,f16,f16,0,1 +66,3072,2304,64,0,f16,f16,0,1 +66,4096,2304,64,0,f16,f16,0,1 +66,4608,2304,64,0,f16,f16,0,1 +66,7168,2304,64,0,f16,f16,0,1 +66,512,7168,64,0,f16,f16,0,1 +66,1536,7168,64,0,f16,f16,0,1 +66,3072,7168,64,0,f16,f16,0,1 +66,4096,7168,64,0,f16,f16,0,1 +66,4608,7168,64,0,f16,f16,0,1 +66,7168,7168,64,0,f16,f16,0,1 +67,576,256,64,0,f16,f16,0,1 +67,1536,256,64,0,f16,f16,0,1 +67,3072,256,64,0,f16,f16,0,1 +67,4096,256,64,0,f16,f16,0,1 
+67,4608,256,64,0,f16,f16,0,1 +67,7168,256,64,0,f16,f16,0,1 +67,576,512,64,0,f16,f16,0,1 +67,1536,512,64,0,f16,f16,0,1 +67,3072,512,64,0,f16,f16,0,1 +67,4096,512,64,0,f16,f16,0,1 +67,4608,512,64,0,f16,f16,0,1 +67,7168,512,64,0,f16,f16,0,1 +67,576,1536,64,0,f16,f16,0,1 +67,1536,1536,64,0,f16,f16,0,1 +67,3072,1536,64,0,f16,f16,0,1 +67,4096,1536,64,0,f16,f16,0,1 +67,4608,1536,64,0,f16,f16,0,1 +67,7168,1536,64,0,f16,f16,0,1 +67,512,2048,64,0,f16,f16,0,1 +67,1536,2048,64,0,f16,f16,0,1 +67,3072,2048,64,0,f16,f16,0,1 +67,4096,2048,64,0,f16,f16,0,1 +67,4608,2048,64,0,f16,f16,0,1 +67,7168,2048,64,0,f16,f16,0,1 +67,512,2304,64,0,f16,f16,0,1 +67,1536,2304,64,0,f16,f16,0,1 +67,3072,2304,64,0,f16,f16,0,1 +67,4096,2304,64,0,f16,f16,0,1 +67,4608,2304,64,0,f16,f16,0,1 +67,7168,2304,64,0,f16,f16,0,1 +67,512,7168,64,0,f16,f16,0,1 +67,1536,7168,64,0,f16,f16,0,1 +67,3072,7168,64,0,f16,f16,0,1 +67,4096,7168,64,0,f16,f16,0,1 +67,4608,7168,64,0,f16,f16,0,1 +67,7168,7168,64,0,f16,f16,0,1 +68,576,256,64,0,f16,f16,0,1 +68,1536,256,64,0,f16,f16,0,1 +68,3072,256,64,0,f16,f16,0,1 +68,4096,256,64,0,f16,f16,0,1 +68,4608,256,64,0,f16,f16,0,1 +68,7168,256,64,0,f16,f16,0,1 +68,576,512,64,0,f16,f16,0,1 +68,1536,512,64,0,f16,f16,0,1 +68,3072,512,64,0,f16,f16,0,1 +68,4096,512,64,0,f16,f16,0,1 +68,4608,512,64,0,f16,f16,0,1 +68,7168,512,64,0,f16,f16,0,1 +68,576,1536,64,0,f16,f16,0,1 +68,1536,1536,64,0,f16,f16,0,1 +68,3072,1536,64,0,f16,f16,0,1 +68,4096,1536,64,0,f16,f16,0,1 +68,4608,1536,64,0,f16,f16,0,1 +68,7168,1536,64,0,f16,f16,0,1 +68,512,2048,64,0,f16,f16,0,1 +68,1536,2048,64,0,f16,f16,0,1 +68,3072,2048,64,0,f16,f16,0,1 +68,4096,2048,64,0,f16,f16,0,1 +68,4608,2048,64,0,f16,f16,0,1 +68,7168,2048,64,0,f16,f16,0,1 +68,512,2304,64,0,f16,f16,0,1 +68,1536,2304,64,0,f16,f16,0,1 +68,3072,2304,64,0,f16,f16,0,1 +68,4096,2304,64,0,f16,f16,0,1 +68,4608,2304,64,0,f16,f16,0,1 +68,7168,2304,64,0,f16,f16,0,1 +68,512,7168,64,0,f16,f16,0,1 +68,1536,7168,64,0,f16,f16,0,1 +68,3072,7168,64,0,f16,f16,0,1 
+68,4096,7168,64,0,f16,f16,0,1 +68,4608,7168,64,0,f16,f16,0,1 +68,7168,7168,64,0,f16,f16,0,1 +69,576,256,64,0,f16,f16,0,1 +69,1536,256,64,0,f16,f16,0,1 +69,3072,256,64,0,f16,f16,0,1 +69,4096,256,64,0,f16,f16,0,1 +69,4608,256,64,0,f16,f16,0,1 +69,7168,256,64,0,f16,f16,0,1 +69,576,512,64,0,f16,f16,0,1 +69,1536,512,64,0,f16,f16,0,1 +69,3072,512,64,0,f16,f16,0,1 +69,4096,512,64,0,f16,f16,0,1 +69,4608,512,64,0,f16,f16,0,1 +69,7168,512,64,0,f16,f16,0,1 +69,576,1536,64,0,f16,f16,0,1 +69,1536,1536,64,0,f16,f16,0,1 +69,3072,1536,64,0,f16,f16,0,1 +69,4096,1536,64,0,f16,f16,0,1 +69,4608,1536,64,0,f16,f16,0,1 +69,7168,1536,64,0,f16,f16,0,1 +69,512,2048,64,0,f16,f16,0,1 +69,1536,2048,64,0,f16,f16,0,1 +69,3072,2048,64,0,f16,f16,0,1 +69,4096,2048,64,0,f16,f16,0,1 +69,4608,2048,64,0,f16,f16,0,1 +69,7168,2048,64,0,f16,f16,0,1 +69,512,2304,64,0,f16,f16,0,1 +69,1536,2304,64,0,f16,f16,0,1 +69,3072,2304,64,0,f16,f16,0,1 +69,4096,2304,64,0,f16,f16,0,1 +69,4608,2304,64,0,f16,f16,0,1 +69,7168,2304,64,0,f16,f16,0,1 +69,512,7168,64,0,f16,f16,0,1 +69,1536,7168,64,0,f16,f16,0,1 +69,3072,7168,64,0,f16,f16,0,1 +69,4096,7168,64,0,f16,f16,0,1 +69,4608,7168,64,0,f16,f16,0,1 +69,7168,7168,64,0,f16,f16,0,1 +70,576,256,64,0,f16,f16,0,1 +70,1536,256,64,0,f16,f16,0,1 +70,3072,256,64,0,f16,f16,0,1 +70,4096,256,64,0,f16,f16,0,1 +70,4608,256,64,0,f16,f16,0,1 +70,7168,256,64,0,f16,f16,0,1 +70,576,512,64,0,f16,f16,0,1 +70,1536,512,64,0,f16,f16,0,1 +70,3072,512,64,0,f16,f16,0,1 +70,4096,512,64,0,f16,f16,0,1 +70,4608,512,64,0,f16,f16,0,1 +70,7168,512,64,0,f16,f16,0,1 +70,576,1536,64,0,f16,f16,0,1 +70,1536,1536,64,0,f16,f16,0,1 +70,3072,1536,64,0,f16,f16,0,1 +70,4096,1536,64,0,f16,f16,0,1 +70,4608,1536,64,0,f16,f16,0,1 +70,7168,1536,64,0,f16,f16,0,1 +70,512,2048,64,0,f16,f16,0,1 +70,1536,2048,64,0,f16,f16,0,1 +70,3072,2048,64,0,f16,f16,0,1 +70,4096,2048,64,0,f16,f16,0,1 +70,4608,2048,64,0,f16,f16,0,1 +70,7168,2048,64,0,f16,f16,0,1 +70,512,2304,64,0,f16,f16,0,1 +70,1536,2304,64,0,f16,f16,0,1 
+70,3072,2304,64,0,f16,f16,0,1 +70,4096,2304,64,0,f16,f16,0,1 +70,4608,2304,64,0,f16,f16,0,1 +70,7168,2304,64,0,f16,f16,0,1 +70,512,7168,64,0,f16,f16,0,1 +70,1536,7168,64,0,f16,f16,0,1 +70,3072,7168,64,0,f16,f16,0,1 +70,4096,7168,64,0,f16,f16,0,1 +70,4608,7168,64,0,f16,f16,0,1 +70,7168,7168,64,0,f16,f16,0,1 +71,576,256,64,0,f16,f16,0,1 +71,1536,256,64,0,f16,f16,0,1 +71,3072,256,64,0,f16,f16,0,1 +71,4096,256,64,0,f16,f16,0,1 +71,4608,256,64,0,f16,f16,0,1 +71,7168,256,64,0,f16,f16,0,1 +71,576,512,64,0,f16,f16,0,1 +71,1536,512,64,0,f16,f16,0,1 +71,3072,512,64,0,f16,f16,0,1 +71,4096,512,64,0,f16,f16,0,1 +71,4608,512,64,0,f16,f16,0,1 +71,7168,512,64,0,f16,f16,0,1 +71,576,1536,64,0,f16,f16,0,1 +71,1536,1536,64,0,f16,f16,0,1 +71,3072,1536,64,0,f16,f16,0,1 +71,4096,1536,64,0,f16,f16,0,1 +71,4608,1536,64,0,f16,f16,0,1 +71,7168,1536,64,0,f16,f16,0,1 +71,512,2048,64,0,f16,f16,0,1 +71,1536,2048,64,0,f16,f16,0,1 +71,3072,2048,64,0,f16,f16,0,1 +71,4096,2048,64,0,f16,f16,0,1 +71,4608,2048,64,0,f16,f16,0,1 +71,7168,2048,64,0,f16,f16,0,1 +71,512,2304,64,0,f16,f16,0,1 +71,1536,2304,64,0,f16,f16,0,1 +71,3072,2304,64,0,f16,f16,0,1 +71,4096,2304,64,0,f16,f16,0,1 +71,4608,2304,64,0,f16,f16,0,1 +71,7168,2304,64,0,f16,f16,0,1 +71,512,7168,64,0,f16,f16,0,1 +71,1536,7168,64,0,f16,f16,0,1 +71,3072,7168,64,0,f16,f16,0,1 +71,4096,7168,64,0,f16,f16,0,1 +71,4608,7168,64,0,f16,f16,0,1 +71,7168,7168,64,0,f16,f16,0,1 +72,576,256,64,0,f16,f16,0,1 +72,1536,256,64,0,f16,f16,0,1 +72,3072,256,64,0,f16,f16,0,1 +72,4096,256,64,0,f16,f16,0,1 +72,4608,256,64,0,f16,f16,0,1 +72,7168,256,64,0,f16,f16,0,1 +72,576,512,64,0,f16,f16,0,1 +72,1536,512,64,0,f16,f16,0,1 +72,3072,512,64,0,f16,f16,0,1 +72,4096,512,64,0,f16,f16,0,1 +72,4608,512,64,0,f16,f16,0,1 +72,7168,512,64,0,f16,f16,0,1 +72,576,1536,64,0,f16,f16,0,1 +72,1536,1536,64,0,f16,f16,0,1 +72,3072,1536,64,0,f16,f16,0,1 +72,4096,1536,64,0,f16,f16,0,1 +72,4608,1536,64,0,f16,f16,0,1 +72,7168,1536,64,0,f16,f16,0,1 +72,512,2048,64,0,f16,f16,0,1 
+72,1536,2048,64,0,f16,f16,0,1 +72,3072,2048,64,0,f16,f16,0,1 +72,4096,2048,64,0,f16,f16,0,1 +72,4608,2048,64,0,f16,f16,0,1 +72,7168,2048,64,0,f16,f16,0,1 +72,512,2304,64,0,f16,f16,0,1 +72,1536,2304,64,0,f16,f16,0,1 +72,3072,2304,64,0,f16,f16,0,1 +72,4096,2304,64,0,f16,f16,0,1 +72,4608,2304,64,0,f16,f16,0,1 +72,7168,2304,64,0,f16,f16,0,1 +72,512,7168,64,0,f16,f16,0,1 +72,1536,7168,64,0,f16,f16,0,1 +72,3072,7168,64,0,f16,f16,0,1 +72,4096,7168,64,0,f16,f16,0,1 +72,4608,7168,64,0,f16,f16,0,1 +72,7168,7168,64,0,f16,f16,0,1 +73,576,256,64,0,f16,f16,0,1 +73,1536,256,64,0,f16,f16,0,1 +73,3072,256,64,0,f16,f16,0,1 +73,4096,256,64,0,f16,f16,0,1 +73,4608,256,64,0,f16,f16,0,1 +73,7168,256,64,0,f16,f16,0,1 +73,576,512,64,0,f16,f16,0,1 +73,1536,512,64,0,f16,f16,0,1 +73,3072,512,64,0,f16,f16,0,1 +73,4096,512,64,0,f16,f16,0,1 +73,4608,512,64,0,f16,f16,0,1 +73,7168,512,64,0,f16,f16,0,1 +73,576,1536,64,0,f16,f16,0,1 +73,1536,1536,64,0,f16,f16,0,1 +73,3072,1536,64,0,f16,f16,0,1 +73,4096,1536,64,0,f16,f16,0,1 +73,4608,1536,64,0,f16,f16,0,1 +73,7168,1536,64,0,f16,f16,0,1 +73,512,2048,64,0,f16,f16,0,1 +73,1536,2048,64,0,f16,f16,0,1 +73,3072,2048,64,0,f16,f16,0,1 +73,4096,2048,64,0,f16,f16,0,1 +73,4608,2048,64,0,f16,f16,0,1 +73,7168,2048,64,0,f16,f16,0,1 +73,512,2304,64,0,f16,f16,0,1 +73,1536,2304,64,0,f16,f16,0,1 +73,3072,2304,64,0,f16,f16,0,1 +73,4096,2304,64,0,f16,f16,0,1 +73,4608,2304,64,0,f16,f16,0,1 +73,7168,2304,64,0,f16,f16,0,1 +73,512,7168,64,0,f16,f16,0,1 +73,1536,7168,64,0,f16,f16,0,1 +73,3072,7168,64,0,f16,f16,0,1 +73,4096,7168,64,0,f16,f16,0,1 +73,4608,7168,64,0,f16,f16,0,1 +73,7168,7168,64,0,f16,f16,0,1 +74,576,256,64,0,f16,f16,0,1 +74,1536,256,64,0,f16,f16,0,1 +74,3072,256,64,0,f16,f16,0,1 +74,4096,256,64,0,f16,f16,0,1 +74,4608,256,64,0,f16,f16,0,1 +74,7168,256,64,0,f16,f16,0,1 +74,576,512,64,0,f16,f16,0,1 +74,1536,512,64,0,f16,f16,0,1 +74,3072,512,64,0,f16,f16,0,1 +74,4096,512,64,0,f16,f16,0,1 +74,4608,512,64,0,f16,f16,0,1 +74,7168,512,64,0,f16,f16,0,1 
+74,576,1536,64,0,f16,f16,0,1 +74,1536,1536,64,0,f16,f16,0,1 +74,3072,1536,64,0,f16,f16,0,1 +74,4096,1536,64,0,f16,f16,0,1 +74,4608,1536,64,0,f16,f16,0,1 +74,7168,1536,64,0,f16,f16,0,1 +74,512,2048,64,0,f16,f16,0,1 +74,1536,2048,64,0,f16,f16,0,1 +74,3072,2048,64,0,f16,f16,0,1 +74,4096,2048,64,0,f16,f16,0,1 +74,4608,2048,64,0,f16,f16,0,1 +74,7168,2048,64,0,f16,f16,0,1 +74,512,2304,64,0,f16,f16,0,1 +74,1536,2304,64,0,f16,f16,0,1 +74,3072,2304,64,0,f16,f16,0,1 +74,4096,2304,64,0,f16,f16,0,1 +74,4608,2304,64,0,f16,f16,0,1 +74,7168,2304,64,0,f16,f16,0,1 +74,512,7168,64,0,f16,f16,0,1 +74,1536,7168,64,0,f16,f16,0,1 +74,3072,7168,64,0,f16,f16,0,1 +74,4096,7168,64,0,f16,f16,0,1 +74,4608,7168,64,0,f16,f16,0,1 +74,7168,7168,64,0,f16,f16,0,1 +75,576,256,64,0,f16,f16,0,1 +75,1536,256,64,0,f16,f16,0,1 +75,3072,256,64,0,f16,f16,0,1 +75,4096,256,64,0,f16,f16,0,1 +75,4608,256,64,0,f16,f16,0,1 +75,7168,256,64,0,f16,f16,0,1 +75,576,512,64,0,f16,f16,0,1 +75,1536,512,64,0,f16,f16,0,1 +75,3072,512,64,0,f16,f16,0,1 +75,4096,512,64,0,f16,f16,0,1 +75,4608,512,64,0,f16,f16,0,1 +75,7168,512,64,0,f16,f16,0,1 +75,576,1536,64,0,f16,f16,0,1 +75,1536,1536,64,0,f16,f16,0,1 +75,3072,1536,64,0,f16,f16,0,1 +75,4096,1536,64,0,f16,f16,0,1 +75,4608,1536,64,0,f16,f16,0,1 +75,7168,1536,64,0,f16,f16,0,1 +75,512,2048,64,0,f16,f16,0,1 +75,1536,2048,64,0,f16,f16,0,1 +75,3072,2048,64,0,f16,f16,0,1 +75,4096,2048,64,0,f16,f16,0,1 +75,4608,2048,64,0,f16,f16,0,1 +75,7168,2048,64,0,f16,f16,0,1 +75,512,2304,64,0,f16,f16,0,1 +75,1536,2304,64,0,f16,f16,0,1 +75,3072,2304,64,0,f16,f16,0,1 +75,4096,2304,64,0,f16,f16,0,1 +75,4608,2304,64,0,f16,f16,0,1 +75,7168,2304,64,0,f16,f16,0,1 +75,512,7168,64,0,f16,f16,0,1 +75,1536,7168,64,0,f16,f16,0,1 +75,3072,7168,64,0,f16,f16,0,1 +75,4096,7168,64,0,f16,f16,0,1 +75,4608,7168,64,0,f16,f16,0,1 +75,7168,7168,64,0,f16,f16,0,1 +76,576,256,64,0,f16,f16,0,1 +76,1536,256,64,0,f16,f16,0,1 +76,3072,256,64,0,f16,f16,0,1 +76,4096,256,64,0,f16,f16,0,1 +76,4608,256,64,0,f16,f16,0,1 
+76,7168,256,64,0,f16,f16,0,1 +76,576,512,64,0,f16,f16,0,1 +76,1536,512,64,0,f16,f16,0,1 +76,3072,512,64,0,f16,f16,0,1 +76,4096,512,64,0,f16,f16,0,1 +76,4608,512,64,0,f16,f16,0,1 +76,7168,512,64,0,f16,f16,0,1 +76,576,1536,64,0,f16,f16,0,1 +76,1536,1536,64,0,f16,f16,0,1 +76,3072,1536,64,0,f16,f16,0,1 +76,4096,1536,64,0,f16,f16,0,1 +76,4608,1536,64,0,f16,f16,0,1 +76,7168,1536,64,0,f16,f16,0,1 +76,512,2048,64,0,f16,f16,0,1 +76,1536,2048,64,0,f16,f16,0,1 +76,3072,2048,64,0,f16,f16,0,1 +76,4096,2048,64,0,f16,f16,0,1 +76,4608,2048,64,0,f16,f16,0,1 +76,7168,2048,64,0,f16,f16,0,1 +76,512,2304,64,0,f16,f16,0,1 +76,1536,2304,64,0,f16,f16,0,1 +76,3072,2304,64,0,f16,f16,0,1 +76,4096,2304,64,0,f16,f16,0,1 +76,4608,2304,64,0,f16,f16,0,1 +76,7168,2304,64,0,f16,f16,0,1 +76,512,7168,64,0,f16,f16,0,1 +76,1536,7168,64,0,f16,f16,0,1 +76,3072,7168,64,0,f16,f16,0,1 +76,4096,7168,64,0,f16,f16,0,1 +76,4608,7168,64,0,f16,f16,0,1 +76,7168,7168,64,0,f16,f16,0,1 +77,576,256,64,0,f16,f16,0,1 +77,1536,256,64,0,f16,f16,0,1 +77,3072,256,64,0,f16,f16,0,1 +77,4096,256,64,0,f16,f16,0,1 +77,4608,256,64,0,f16,f16,0,1 +77,7168,256,64,0,f16,f16,0,1 +77,576,512,64,0,f16,f16,0,1 +77,1536,512,64,0,f16,f16,0,1 +77,3072,512,64,0,f16,f16,0,1 +77,4096,512,64,0,f16,f16,0,1 +77,4608,512,64,0,f16,f16,0,1 +77,7168,512,64,0,f16,f16,0,1 +77,576,1536,64,0,f16,f16,0,1 +77,1536,1536,64,0,f16,f16,0,1 +77,3072,1536,64,0,f16,f16,0,1 +77,4096,1536,64,0,f16,f16,0,1 +77,4608,1536,64,0,f16,f16,0,1 +77,7168,1536,64,0,f16,f16,0,1 +77,512,2048,64,0,f16,f16,0,1 +77,1536,2048,64,0,f16,f16,0,1 +77,3072,2048,64,0,f16,f16,0,1 +77,4096,2048,64,0,f16,f16,0,1 +77,4608,2048,64,0,f16,f16,0,1 +77,7168,2048,64,0,f16,f16,0,1 +77,512,2304,64,0,f16,f16,0,1 +77,1536,2304,64,0,f16,f16,0,1 +77,3072,2304,64,0,f16,f16,0,1 +77,4096,2304,64,0,f16,f16,0,1 +77,4608,2304,64,0,f16,f16,0,1 +77,7168,2304,64,0,f16,f16,0,1 +77,512,7168,64,0,f16,f16,0,1 +77,1536,7168,64,0,f16,f16,0,1 +77,3072,7168,64,0,f16,f16,0,1 +77,4096,7168,64,0,f16,f16,0,1 
+77,4608,7168,64,0,f16,f16,0,1 +77,7168,7168,64,0,f16,f16,0,1 +78,576,256,64,0,f16,f16,0,1 +78,1536,256,64,0,f16,f16,0,1 +78,3072,256,64,0,f16,f16,0,1 +78,4096,256,64,0,f16,f16,0,1 +78,4608,256,64,0,f16,f16,0,1 +78,7168,256,64,0,f16,f16,0,1 +78,576,512,64,0,f16,f16,0,1 +78,1536,512,64,0,f16,f16,0,1 +78,3072,512,64,0,f16,f16,0,1 +78,4096,512,64,0,f16,f16,0,1 +78,4608,512,64,0,f16,f16,0,1 +78,7168,512,64,0,f16,f16,0,1 +78,576,1536,64,0,f16,f16,0,1 +78,1536,1536,64,0,f16,f16,0,1 +78,3072,1536,64,0,f16,f16,0,1 +78,4096,1536,64,0,f16,f16,0,1 +78,4608,1536,64,0,f16,f16,0,1 +78,7168,1536,64,0,f16,f16,0,1 +78,512,2048,64,0,f16,f16,0,1 +78,1536,2048,64,0,f16,f16,0,1 +78,3072,2048,64,0,f16,f16,0,1 +78,4096,2048,64,0,f16,f16,0,1 +78,4608,2048,64,0,f16,f16,0,1 +78,7168,2048,64,0,f16,f16,0,1 +78,512,2304,64,0,f16,f16,0,1 +78,1536,2304,64,0,f16,f16,0,1 +78,3072,2304,64,0,f16,f16,0,1 +78,4096,2304,64,0,f16,f16,0,1 +78,4608,2304,64,0,f16,f16,0,1 +78,7168,2304,64,0,f16,f16,0,1 +78,512,7168,64,0,f16,f16,0,1 +78,1536,7168,64,0,f16,f16,0,1 +78,3072,7168,64,0,f16,f16,0,1 +78,4096,7168,64,0,f16,f16,0,1 +78,4608,7168,64,0,f16,f16,0,1 +78,7168,7168,64,0,f16,f16,0,1 +79,576,256,64,0,f16,f16,0,1 +79,1536,256,64,0,f16,f16,0,1 +79,3072,256,64,0,f16,f16,0,1 +79,4096,256,64,0,f16,f16,0,1 +79,4608,256,64,0,f16,f16,0,1 +79,7168,256,64,0,f16,f16,0,1 +79,576,512,64,0,f16,f16,0,1 +79,1536,512,64,0,f16,f16,0,1 +79,3072,512,64,0,f16,f16,0,1 +79,4096,512,64,0,f16,f16,0,1 +79,4608,512,64,0,f16,f16,0,1 +79,7168,512,64,0,f16,f16,0,1 +79,576,1536,64,0,f16,f16,0,1 +79,1536,1536,64,0,f16,f16,0,1 +79,3072,1536,64,0,f16,f16,0,1 +79,4096,1536,64,0,f16,f16,0,1 +79,4608,1536,64,0,f16,f16,0,1 +79,7168,1536,64,0,f16,f16,0,1 +79,512,2048,64,0,f16,f16,0,1 +79,1536,2048,64,0,f16,f16,0,1 +79,3072,2048,64,0,f16,f16,0,1 +79,4096,2048,64,0,f16,f16,0,1 +79,4608,2048,64,0,f16,f16,0,1 +79,7168,2048,64,0,f16,f16,0,1 +79,512,2304,64,0,f16,f16,0,1 +79,1536,2304,64,0,f16,f16,0,1 +79,3072,2304,64,0,f16,f16,0,1 
+79,4096,2304,64,0,f16,f16,0,1 +79,4608,2304,64,0,f16,f16,0,1 +79,7168,2304,64,0,f16,f16,0,1 +79,512,7168,64,0,f16,f16,0,1 +79,1536,7168,64,0,f16,f16,0,1 +79,3072,7168,64,0,f16,f16,0,1 +79,4096,7168,64,0,f16,f16,0,1 +79,4608,7168,64,0,f16,f16,0,1 +79,7168,7168,64,0,f16,f16,0,1 +80,576,256,64,0,f16,f16,0,1 +80,1536,256,64,0,f16,f16,0,1 +80,3072,256,64,0,f16,f16,0,1 +80,4096,256,64,0,f16,f16,0,1 +80,4608,256,64,0,f16,f16,0,1 +80,7168,256,64,0,f16,f16,0,1 +80,576,512,64,0,f16,f16,0,1 +80,1536,512,64,0,f16,f16,0,1 +80,3072,512,64,0,f16,f16,0,1 +80,4096,512,64,0,f16,f16,0,1 +80,4608,512,64,0,f16,f16,0,1 +80,7168,512,64,0,f16,f16,0,1 +80,576,1536,64,0,f16,f16,0,1 +80,1536,1536,64,0,f16,f16,0,1 +80,3072,1536,64,0,f16,f16,0,1 +80,4096,1536,64,0,f16,f16,0,1 +80,4608,1536,64,0,f16,f16,0,1 +80,7168,1536,64,0,f16,f16,0,1 +80,512,2048,64,0,f16,f16,0,1 +80,1536,2048,64,0,f16,f16,0,1 +80,3072,2048,64,0,f16,f16,0,1 +80,4096,2048,64,0,f16,f16,0,1 +80,4608,2048,64,0,f16,f16,0,1 +80,7168,2048,64,0,f16,f16,0,1 +80,512,2304,64,0,f16,f16,0,1 +80,1536,2304,64,0,f16,f16,0,1 +80,3072,2304,64,0,f16,f16,0,1 +80,4096,2304,64,0,f16,f16,0,1 +80,4608,2304,64,0,f16,f16,0,1 +80,7168,2304,64,0,f16,f16,0,1 +80,512,7168,64,0,f16,f16,0,1 +80,1536,7168,64,0,f16,f16,0,1 +80,3072,7168,64,0,f16,f16,0,1 +80,4096,7168,64,0,f16,f16,0,1 +80,4608,7168,64,0,f16,f16,0,1 +80,7168,7168,64,0,f16,f16,0,1 +81,576,256,64,0,f16,f16,0,1 +81,1536,256,64,0,f16,f16,0,1 +81,3072,256,64,0,f16,f16,0,1 +81,4096,256,64,0,f16,f16,0,1 +81,4608,256,64,0,f16,f16,0,1 +81,7168,256,64,0,f16,f16,0,1 +81,576,512,64,0,f16,f16,0,1 +81,1536,512,64,0,f16,f16,0,1 +81,3072,512,64,0,f16,f16,0,1 +81,4096,512,64,0,f16,f16,0,1 +81,4608,512,64,0,f16,f16,0,1 +81,7168,512,64,0,f16,f16,0,1 +81,576,1536,64,0,f16,f16,0,1 +81,1536,1536,64,0,f16,f16,0,1 +81,3072,1536,64,0,f16,f16,0,1 +81,4096,1536,64,0,f16,f16,0,1 +81,4608,1536,64,0,f16,f16,0,1 +81,7168,1536,64,0,f16,f16,0,1 +81,512,2048,64,0,f16,f16,0,1 +81,1536,2048,64,0,f16,f16,0,1 
+81,3072,2048,64,0,f16,f16,0,1 +81,4096,2048,64,0,f16,f16,0,1 +81,4608,2048,64,0,f16,f16,0,1 +81,7168,2048,64,0,f16,f16,0,1 +81,512,2304,64,0,f16,f16,0,1 +81,1536,2304,64,0,f16,f16,0,1 +81,3072,2304,64,0,f16,f16,0,1 +81,4096,2304,64,0,f16,f16,0,1 +81,4608,2304,64,0,f16,f16,0,1 +81,7168,2304,64,0,f16,f16,0,1 +81,512,7168,64,0,f16,f16,0,1 +81,1536,7168,64,0,f16,f16,0,1 +81,3072,7168,64,0,f16,f16,0,1 +81,4096,7168,64,0,f16,f16,0,1 +81,4608,7168,64,0,f16,f16,0,1 +81,7168,7168,64,0,f16,f16,0,1 +82,576,256,64,0,f16,f16,0,1 +82,1536,256,64,0,f16,f16,0,1 +82,3072,256,64,0,f16,f16,0,1 +82,4096,256,64,0,f16,f16,0,1 +82,4608,256,64,0,f16,f16,0,1 +82,7168,256,64,0,f16,f16,0,1 +82,576,512,64,0,f16,f16,0,1 +82,1536,512,64,0,f16,f16,0,1 +82,3072,512,64,0,f16,f16,0,1 +82,4096,512,64,0,f16,f16,0,1 +82,4608,512,64,0,f16,f16,0,1 +82,7168,512,64,0,f16,f16,0,1 +82,576,1536,64,0,f16,f16,0,1 +82,1536,1536,64,0,f16,f16,0,1 +82,3072,1536,64,0,f16,f16,0,1 +82,4096,1536,64,0,f16,f16,0,1 +82,4608,1536,64,0,f16,f16,0,1 +82,7168,1536,64,0,f16,f16,0,1 +82,512,2048,64,0,f16,f16,0,1 +82,1536,2048,64,0,f16,f16,0,1 +82,3072,2048,64,0,f16,f16,0,1 +82,4096,2048,64,0,f16,f16,0,1 +82,4608,2048,64,0,f16,f16,0,1 +82,7168,2048,64,0,f16,f16,0,1 +82,512,2304,64,0,f16,f16,0,1 +82,1536,2304,64,0,f16,f16,0,1 +82,3072,2304,64,0,f16,f16,0,1 +82,4096,2304,64,0,f16,f16,0,1 +82,4608,2304,64,0,f16,f16,0,1 +82,7168,2304,64,0,f16,f16,0,1 +82,512,7168,64,0,f16,f16,0,1 +82,1536,7168,64,0,f16,f16,0,1 +82,3072,7168,64,0,f16,f16,0,1 +82,4096,7168,64,0,f16,f16,0,1 +82,4608,7168,64,0,f16,f16,0,1 +82,7168,7168,64,0,f16,f16,0,1 +83,576,256,64,0,f16,f16,0,1 +83,1536,256,64,0,f16,f16,0,1 +83,3072,256,64,0,f16,f16,0,1 +83,4096,256,64,0,f16,f16,0,1 +83,4608,256,64,0,f16,f16,0,1 +83,7168,256,64,0,f16,f16,0,1 +83,576,512,64,0,f16,f16,0,1 +83,1536,512,64,0,f16,f16,0,1 +83,3072,512,64,0,f16,f16,0,1 +83,4096,512,64,0,f16,f16,0,1 +83,4608,512,64,0,f16,f16,0,1 +83,7168,512,64,0,f16,f16,0,1 +83,576,1536,64,0,f16,f16,0,1 
+83,1536,1536,64,0,f16,f16,0,1 +83,3072,1536,64,0,f16,f16,0,1 +83,4096,1536,64,0,f16,f16,0,1 +83,4608,1536,64,0,f16,f16,0,1 +83,7168,1536,64,0,f16,f16,0,1 +83,512,2048,64,0,f16,f16,0,1 +83,1536,2048,64,0,f16,f16,0,1 +83,3072,2048,64,0,f16,f16,0,1 +83,4096,2048,64,0,f16,f16,0,1 +83,4608,2048,64,0,f16,f16,0,1 +83,7168,2048,64,0,f16,f16,0,1 +83,512,2304,64,0,f16,f16,0,1 +83,1536,2304,64,0,f16,f16,0,1 +83,3072,2304,64,0,f16,f16,0,1 +83,4096,2304,64,0,f16,f16,0,1 +83,4608,2304,64,0,f16,f16,0,1 +83,7168,2304,64,0,f16,f16,0,1 +83,512,7168,64,0,f16,f16,0,1 +83,1536,7168,64,0,f16,f16,0,1 +83,3072,7168,64,0,f16,f16,0,1 +83,4096,7168,64,0,f16,f16,0,1 +83,4608,7168,64,0,f16,f16,0,1 +83,7168,7168,64,0,f16,f16,0,1 +84,576,256,64,0,f16,f16,0,1 +84,1536,256,64,0,f16,f16,0,1 +84,3072,256,64,0,f16,f16,0,1 +84,4096,256,64,0,f16,f16,0,1 +84,4608,256,64,0,f16,f16,0,1 +84,7168,256,64,0,f16,f16,0,1 +84,576,512,64,0,f16,f16,0,1 +84,1536,512,64,0,f16,f16,0,1 +84,3072,512,64,0,f16,f16,0,1 +84,4096,512,64,0,f16,f16,0,1 +84,4608,512,64,0,f16,f16,0,1 +84,7168,512,64,0,f16,f16,0,1 +84,576,1536,64,0,f16,f16,0,1 +84,1536,1536,64,0,f16,f16,0,1 +84,3072,1536,64,0,f16,f16,0,1 +84,4096,1536,64,0,f16,f16,0,1 +84,4608,1536,64,0,f16,f16,0,1 +84,7168,1536,64,0,f16,f16,0,1 +84,512,2048,64,0,f16,f16,0,1 +84,1536,2048,64,0,f16,f16,0,1 +84,3072,2048,64,0,f16,f16,0,1 +84,4096,2048,64,0,f16,f16,0,1 +84,4608,2048,64,0,f16,f16,0,1 +84,7168,2048,64,0,f16,f16,0,1 +84,512,2304,64,0,f16,f16,0,1 +84,1536,2304,64,0,f16,f16,0,1 +84,3072,2304,64,0,f16,f16,0,1 +84,4096,2304,64,0,f16,f16,0,1 +84,4608,2304,64,0,f16,f16,0,1 +84,7168,2304,64,0,f16,f16,0,1 +84,512,7168,64,0,f16,f16,0,1 +84,1536,7168,64,0,f16,f16,0,1 +84,3072,7168,64,0,f16,f16,0,1 +84,4096,7168,64,0,f16,f16,0,1 +84,4608,7168,64,0,f16,f16,0,1 +84,7168,7168,64,0,f16,f16,0,1 +85,576,256,64,0,f16,f16,0,1 +85,1536,256,64,0,f16,f16,0,1 +85,3072,256,64,0,f16,f16,0,1 +85,4096,256,64,0,f16,f16,0,1 +85,4608,256,64,0,f16,f16,0,1 +85,7168,256,64,0,f16,f16,0,1 
+85,576,512,64,0,f16,f16,0,1 +85,1536,512,64,0,f16,f16,0,1 +85,3072,512,64,0,f16,f16,0,1 +85,4096,512,64,0,f16,f16,0,1 +85,4608,512,64,0,f16,f16,0,1 +85,7168,512,64,0,f16,f16,0,1 +85,576,1536,64,0,f16,f16,0,1 +85,1536,1536,64,0,f16,f16,0,1 +85,3072,1536,64,0,f16,f16,0,1 +85,4096,1536,64,0,f16,f16,0,1 +85,4608,1536,64,0,f16,f16,0,1 +85,7168,1536,64,0,f16,f16,0,1 +85,512,2048,64,0,f16,f16,0,1 +85,1536,2048,64,0,f16,f16,0,1 +85,3072,2048,64,0,f16,f16,0,1 +85,4096,2048,64,0,f16,f16,0,1 +85,4608,2048,64,0,f16,f16,0,1 +85,7168,2048,64,0,f16,f16,0,1 +85,512,2304,64,0,f16,f16,0,1 +85,1536,2304,64,0,f16,f16,0,1 +85,3072,2304,64,0,f16,f16,0,1 +85,4096,2304,64,0,f16,f16,0,1 +85,4608,2304,64,0,f16,f16,0,1 +85,7168,2304,64,0,f16,f16,0,1 +85,512,7168,64,0,f16,f16,0,1 +85,1536,7168,64,0,f16,f16,0,1 +85,3072,7168,64,0,f16,f16,0,1 +85,4096,7168,64,0,f16,f16,0,1 +85,4608,7168,64,0,f16,f16,0,1 +85,7168,7168,64,0,f16,f16,0,1 +86,576,256,64,0,f16,f16,0,1 +86,1536,256,64,0,f16,f16,0,1 +86,3072,256,64,0,f16,f16,0,1 +86,4096,256,64,0,f16,f16,0,1 +86,4608,256,64,0,f16,f16,0,1 +86,7168,256,64,0,f16,f16,0,1 +86,576,512,64,0,f16,f16,0,1 +86,1536,512,64,0,f16,f16,0,1 +86,3072,512,64,0,f16,f16,0,1 +86,4096,512,64,0,f16,f16,0,1 +86,4608,512,64,0,f16,f16,0,1 +86,7168,512,64,0,f16,f16,0,1 +86,576,1536,64,0,f16,f16,0,1 +86,1536,1536,64,0,f16,f16,0,1 +86,3072,1536,64,0,f16,f16,0,1 +86,4096,1536,64,0,f16,f16,0,1 +86,4608,1536,64,0,f16,f16,0,1 +86,7168,1536,64,0,f16,f16,0,1 +86,512,2048,64,0,f16,f16,0,1 +86,1536,2048,64,0,f16,f16,0,1 +86,3072,2048,64,0,f16,f16,0,1 +86,4096,2048,64,0,f16,f16,0,1 +86,4608,2048,64,0,f16,f16,0,1 +86,7168,2048,64,0,f16,f16,0,1 +86,512,2304,64,0,f16,f16,0,1 +86,1536,2304,64,0,f16,f16,0,1 +86,3072,2304,64,0,f16,f16,0,1 +86,4096,2304,64,0,f16,f16,0,1 +86,4608,2304,64,0,f16,f16,0,1 +86,7168,2304,64,0,f16,f16,0,1 +86,512,7168,64,0,f16,f16,0,1 +86,1536,7168,64,0,f16,f16,0,1 +86,3072,7168,64,0,f16,f16,0,1 +86,4096,7168,64,0,f16,f16,0,1 +86,4608,7168,64,0,f16,f16,0,1 
+86,7168,7168,64,0,f16,f16,0,1 +87,576,256,64,0,f16,f16,0,1 +87,1536,256,64,0,f16,f16,0,1 +87,3072,256,64,0,f16,f16,0,1 +87,4096,256,64,0,f16,f16,0,1 +87,4608,256,64,0,f16,f16,0,1 +87,7168,256,64,0,f16,f16,0,1 +87,576,512,64,0,f16,f16,0,1 +87,1536,512,64,0,f16,f16,0,1 +87,3072,512,64,0,f16,f16,0,1 +87,4096,512,64,0,f16,f16,0,1 +87,4608,512,64,0,f16,f16,0,1 +87,7168,512,64,0,f16,f16,0,1 +87,576,1536,64,0,f16,f16,0,1 +87,1536,1536,64,0,f16,f16,0,1 +87,3072,1536,64,0,f16,f16,0,1 +87,4096,1536,64,0,f16,f16,0,1 +87,4608,1536,64,0,f16,f16,0,1 +87,7168,1536,64,0,f16,f16,0,1 +87,512,2048,64,0,f16,f16,0,1 +87,1536,2048,64,0,f16,f16,0,1 +87,3072,2048,64,0,f16,f16,0,1 +87,4096,2048,64,0,f16,f16,0,1 +87,4608,2048,64,0,f16,f16,0,1 +87,7168,2048,64,0,f16,f16,0,1 +87,512,2304,64,0,f16,f16,0,1 +87,1536,2304,64,0,f16,f16,0,1 +87,3072,2304,64,0,f16,f16,0,1 +87,4096,2304,64,0,f16,f16,0,1 +87,4608,2304,64,0,f16,f16,0,1 +87,7168,2304,64,0,f16,f16,0,1 +87,512,7168,64,0,f16,f16,0,1 +87,1536,7168,64,0,f16,f16,0,1 +87,3072,7168,64,0,f16,f16,0,1 +87,4096,7168,64,0,f16,f16,0,1 +87,4608,7168,64,0,f16,f16,0,1 +87,7168,7168,64,0,f16,f16,0,1 +88,576,256,64,0,f16,f16,0,1 +88,1536,256,64,0,f16,f16,0,1 +88,3072,256,64,0,f16,f16,0,1 +88,4096,256,64,0,f16,f16,0,1 +88,4608,256,64,0,f16,f16,0,1 +88,7168,256,64,0,f16,f16,0,1 +88,576,512,64,0,f16,f16,0,1 +88,1536,512,64,0,f16,f16,0,1 +88,3072,512,64,0,f16,f16,0,1 +88,4096,512,64,0,f16,f16,0,1 +88,4608,512,64,0,f16,f16,0,1 +88,7168,512,64,0,f16,f16,0,1 +88,576,1536,64,0,f16,f16,0,1 +88,1536,1536,64,0,f16,f16,0,1 +88,3072,1536,64,0,f16,f16,0,1 +88,4096,1536,64,0,f16,f16,0,1 +88,4608,1536,64,0,f16,f16,0,1 +88,7168,1536,64,0,f16,f16,0,1 +88,512,2048,64,0,f16,f16,0,1 +88,1536,2048,64,0,f16,f16,0,1 +88,3072,2048,64,0,f16,f16,0,1 +88,4096,2048,64,0,f16,f16,0,1 +88,4608,2048,64,0,f16,f16,0,1 +88,7168,2048,64,0,f16,f16,0,1 +88,512,2304,64,0,f16,f16,0,1 +88,1536,2304,64,0,f16,f16,0,1 +88,3072,2304,64,0,f16,f16,0,1 +88,4096,2304,64,0,f16,f16,0,1 
+88,4608,2304,64,0,f16,f16,0,1 +88,7168,2304,64,0,f16,f16,0,1 +88,512,7168,64,0,f16,f16,0,1 +88,1536,7168,64,0,f16,f16,0,1 +88,3072,7168,64,0,f16,f16,0,1 +88,4096,7168,64,0,f16,f16,0,1 +88,4608,7168,64,0,f16,f16,0,1 +88,7168,7168,64,0,f16,f16,0,1 +89,576,256,64,0,f16,f16,0,1 +89,1536,256,64,0,f16,f16,0,1 +89,3072,256,64,0,f16,f16,0,1 +89,4096,256,64,0,f16,f16,0,1 +89,4608,256,64,0,f16,f16,0,1 +89,7168,256,64,0,f16,f16,0,1 +89,576,512,64,0,f16,f16,0,1 +89,1536,512,64,0,f16,f16,0,1 +89,3072,512,64,0,f16,f16,0,1 +89,4096,512,64,0,f16,f16,0,1 +89,4608,512,64,0,f16,f16,0,1 +89,7168,512,64,0,f16,f16,0,1 +89,576,1536,64,0,f16,f16,0,1 +89,1536,1536,64,0,f16,f16,0,1 +89,3072,1536,64,0,f16,f16,0,1 +89,4096,1536,64,0,f16,f16,0,1 +89,4608,1536,64,0,f16,f16,0,1 +89,7168,1536,64,0,f16,f16,0,1 +89,512,2048,64,0,f16,f16,0,1 +89,1536,2048,64,0,f16,f16,0,1 +89,3072,2048,64,0,f16,f16,0,1 +89,4096,2048,64,0,f16,f16,0,1 +89,4608,2048,64,0,f16,f16,0,1 +89,7168,2048,64,0,f16,f16,0,1 +89,512,2304,64,0,f16,f16,0,1 +89,1536,2304,64,0,f16,f16,0,1 +89,3072,2304,64,0,f16,f16,0,1 +89,4096,2304,64,0,f16,f16,0,1 +89,4608,2304,64,0,f16,f16,0,1 +89,7168,2304,64,0,f16,f16,0,1 +89,512,7168,64,0,f16,f16,0,1 +89,1536,7168,64,0,f16,f16,0,1 +89,3072,7168,64,0,f16,f16,0,1 +89,4096,7168,64,0,f16,f16,0,1 +89,4608,7168,64,0,f16,f16,0,1 +89,7168,7168,64,0,f16,f16,0,1 +90,576,256,64,0,f16,f16,0,1 +90,1536,256,64,0,f16,f16,0,1 +90,3072,256,64,0,f16,f16,0,1 +90,4096,256,64,0,f16,f16,0,1 +90,4608,256,64,0,f16,f16,0,1 +90,7168,256,64,0,f16,f16,0,1 +90,576,512,64,0,f16,f16,0,1 +90,1536,512,64,0,f16,f16,0,1 +90,3072,512,64,0,f16,f16,0,1 +90,4096,512,64,0,f16,f16,0,1 +90,4608,512,64,0,f16,f16,0,1 +90,7168,512,64,0,f16,f16,0,1 +90,576,1536,64,0,f16,f16,0,1 +90,1536,1536,64,0,f16,f16,0,1 +90,3072,1536,64,0,f16,f16,0,1 +90,4096,1536,64,0,f16,f16,0,1 +90,4608,1536,64,0,f16,f16,0,1 +90,7168,1536,64,0,f16,f16,0,1 +90,512,2048,64,0,f16,f16,0,1 +90,1536,2048,64,0,f16,f16,0,1 +90,3072,2048,64,0,f16,f16,0,1 
+90,4096,2048,64,0,f16,f16,0,1 +90,4608,2048,64,0,f16,f16,0,1 +90,7168,2048,64,0,f16,f16,0,1 +90,512,2304,64,0,f16,f16,0,1 +90,1536,2304,64,0,f16,f16,0,1 +90,3072,2304,64,0,f16,f16,0,1 +90,4096,2304,64,0,f16,f16,0,1 +90,4608,2304,64,0,f16,f16,0,1 +90,7168,2304,64,0,f16,f16,0,1 +90,512,7168,64,0,f16,f16,0,1 +90,1536,7168,64,0,f16,f16,0,1 +90,3072,7168,64,0,f16,f16,0,1 +90,4096,7168,64,0,f16,f16,0,1 +90,4608,7168,64,0,f16,f16,0,1 +90,7168,7168,64,0,f16,f16,0,1 +91,576,256,64,0,f16,f16,0,1 +91,1536,256,64,0,f16,f16,0,1 +91,3072,256,64,0,f16,f16,0,1 +91,4096,256,64,0,f16,f16,0,1 +91,4608,256,64,0,f16,f16,0,1 +91,7168,256,64,0,f16,f16,0,1 +91,576,512,64,0,f16,f16,0,1 +91,1536,512,64,0,f16,f16,0,1 +91,3072,512,64,0,f16,f16,0,1 +91,4096,512,64,0,f16,f16,0,1 +91,4608,512,64,0,f16,f16,0,1 +91,7168,512,64,0,f16,f16,0,1 +91,576,1536,64,0,f16,f16,0,1 +91,1536,1536,64,0,f16,f16,0,1 +91,3072,1536,64,0,f16,f16,0,1 +91,4096,1536,64,0,f16,f16,0,1 +91,4608,1536,64,0,f16,f16,0,1 +91,7168,1536,64,0,f16,f16,0,1 +91,512,2048,64,0,f16,f16,0,1 +91,1536,2048,64,0,f16,f16,0,1 +91,3072,2048,64,0,f16,f16,0,1 +91,4096,2048,64,0,f16,f16,0,1 +91,4608,2048,64,0,f16,f16,0,1 +91,7168,2048,64,0,f16,f16,0,1 +91,512,2304,64,0,f16,f16,0,1 +91,1536,2304,64,0,f16,f16,0,1 +91,3072,2304,64,0,f16,f16,0,1 +91,4096,2304,64,0,f16,f16,0,1 +91,4608,2304,64,0,f16,f16,0,1 +91,7168,2304,64,0,f16,f16,0,1 +91,512,7168,64,0,f16,f16,0,1 +91,1536,7168,64,0,f16,f16,0,1 +91,3072,7168,64,0,f16,f16,0,1 +91,4096,7168,64,0,f16,f16,0,1 +91,4608,7168,64,0,f16,f16,0,1 +91,7168,7168,64,0,f16,f16,0,1 +92,576,256,64,0,f16,f16,0,1 +92,1536,256,64,0,f16,f16,0,1 +92,3072,256,64,0,f16,f16,0,1 +92,4096,256,64,0,f16,f16,0,1 +92,4608,256,64,0,f16,f16,0,1 +92,7168,256,64,0,f16,f16,0,1 +92,576,512,64,0,f16,f16,0,1 +92,1536,512,64,0,f16,f16,0,1 +92,3072,512,64,0,f16,f16,0,1 +92,4096,512,64,0,f16,f16,0,1 +92,4608,512,64,0,f16,f16,0,1 +92,7168,512,64,0,f16,f16,0,1 +92,576,1536,64,0,f16,f16,0,1 +92,1536,1536,64,0,f16,f16,0,1 
+92,3072,1536,64,0,f16,f16,0,1 +92,4096,1536,64,0,f16,f16,0,1 +92,4608,1536,64,0,f16,f16,0,1 +92,7168,1536,64,0,f16,f16,0,1 +92,512,2048,64,0,f16,f16,0,1 +92,1536,2048,64,0,f16,f16,0,1 +92,3072,2048,64,0,f16,f16,0,1 +92,4096,2048,64,0,f16,f16,0,1 +92,4608,2048,64,0,f16,f16,0,1 +92,7168,2048,64,0,f16,f16,0,1 +92,512,2304,64,0,f16,f16,0,1 +92,1536,2304,64,0,f16,f16,0,1 +92,3072,2304,64,0,f16,f16,0,1 +92,4096,2304,64,0,f16,f16,0,1 +92,4608,2304,64,0,f16,f16,0,1 +92,7168,2304,64,0,f16,f16,0,1 +92,512,7168,64,0,f16,f16,0,1 +92,1536,7168,64,0,f16,f16,0,1 +92,3072,7168,64,0,f16,f16,0,1 +92,4096,7168,64,0,f16,f16,0,1 +92,4608,7168,64,0,f16,f16,0,1 +92,7168,7168,64,0,f16,f16,0,1 +93,576,256,64,0,f16,f16,0,1 +93,1536,256,64,0,f16,f16,0,1 +93,3072,256,64,0,f16,f16,0,1 +93,4096,256,64,0,f16,f16,0,1 +93,4608,256,64,0,f16,f16,0,1 +93,7168,256,64,0,f16,f16,0,1 +93,576,512,64,0,f16,f16,0,1 +93,1536,512,64,0,f16,f16,0,1 +93,3072,512,64,0,f16,f16,0,1 +93,4096,512,64,0,f16,f16,0,1 +93,4608,512,64,0,f16,f16,0,1 +93,7168,512,64,0,f16,f16,0,1 +93,576,1536,64,0,f16,f16,0,1 +93,1536,1536,64,0,f16,f16,0,1 +93,3072,1536,64,0,f16,f16,0,1 +93,4096,1536,64,0,f16,f16,0,1 +93,4608,1536,64,0,f16,f16,0,1 +93,7168,1536,64,0,f16,f16,0,1 +93,512,2048,64,0,f16,f16,0,1 +93,1536,2048,64,0,f16,f16,0,1 +93,3072,2048,64,0,f16,f16,0,1 +93,4096,2048,64,0,f16,f16,0,1 +93,4608,2048,64,0,f16,f16,0,1 +93,7168,2048,64,0,f16,f16,0,1 +93,512,2304,64,0,f16,f16,0,1 +93,1536,2304,64,0,f16,f16,0,1 +93,3072,2304,64,0,f16,f16,0,1 +93,4096,2304,64,0,f16,f16,0,1 +93,4608,2304,64,0,f16,f16,0,1 +93,7168,2304,64,0,f16,f16,0,1 +93,512,7168,64,0,f16,f16,0,1 +93,1536,7168,64,0,f16,f16,0,1 +93,3072,7168,64,0,f16,f16,0,1 +93,4096,7168,64,0,f16,f16,0,1 +93,4608,7168,64,0,f16,f16,0,1 +93,7168,7168,64,0,f16,f16,0,1 +94,576,256,64,0,f16,f16,0,1 +94,1536,256,64,0,f16,f16,0,1 +94,3072,256,64,0,f16,f16,0,1 +94,4096,256,64,0,f16,f16,0,1 +94,4608,256,64,0,f16,f16,0,1 +94,7168,256,64,0,f16,f16,0,1 +94,576,512,64,0,f16,f16,0,1 
+94,1536,512,64,0,f16,f16,0,1 +94,3072,512,64,0,f16,f16,0,1 +94,4096,512,64,0,f16,f16,0,1 +94,4608,512,64,0,f16,f16,0,1 +94,7168,512,64,0,f16,f16,0,1 +94,576,1536,64,0,f16,f16,0,1 +94,1536,1536,64,0,f16,f16,0,1 +94,3072,1536,64,0,f16,f16,0,1 +94,4096,1536,64,0,f16,f16,0,1 +94,4608,1536,64,0,f16,f16,0,1 +94,7168,1536,64,0,f16,f16,0,1 +94,512,2048,64,0,f16,f16,0,1 +94,1536,2048,64,0,f16,f16,0,1 +94,3072,2048,64,0,f16,f16,0,1 +94,4096,2048,64,0,f16,f16,0,1 +94,4608,2048,64,0,f16,f16,0,1 +94,7168,2048,64,0,f16,f16,0,1 +94,512,2304,64,0,f16,f16,0,1 +94,1536,2304,64,0,f16,f16,0,1 +94,3072,2304,64,0,f16,f16,0,1 +94,4096,2304,64,0,f16,f16,0,1 +94,4608,2304,64,0,f16,f16,0,1 +94,7168,2304,64,0,f16,f16,0,1 +94,512,7168,64,0,f16,f16,0,1 +94,1536,7168,64,0,f16,f16,0,1 +94,3072,7168,64,0,f16,f16,0,1 +94,4096,7168,64,0,f16,f16,0,1 +94,4608,7168,64,0,f16,f16,0,1 +94,7168,7168,64,0,f16,f16,0,1 +95,576,256,64,0,f16,f16,0,1 +95,1536,256,64,0,f16,f16,0,1 +95,3072,256,64,0,f16,f16,0,1 +95,4096,256,64,0,f16,f16,0,1 +95,4608,256,64,0,f16,f16,0,1 +95,7168,256,64,0,f16,f16,0,1 +95,576,512,64,0,f16,f16,0,1 +95,1536,512,64,0,f16,f16,0,1 +95,3072,512,64,0,f16,f16,0,1 +95,4096,512,64,0,f16,f16,0,1 +95,4608,512,64,0,f16,f16,0,1 +95,7168,512,64,0,f16,f16,0,1 +95,576,1536,64,0,f16,f16,0,1 +95,1536,1536,64,0,f16,f16,0,1 +95,3072,1536,64,0,f16,f16,0,1 +95,4096,1536,64,0,f16,f16,0,1 +95,4608,1536,64,0,f16,f16,0,1 +95,7168,1536,64,0,f16,f16,0,1 +95,512,2048,64,0,f16,f16,0,1 +95,1536,2048,64,0,f16,f16,0,1 +95,3072,2048,64,0,f16,f16,0,1 +95,4096,2048,64,0,f16,f16,0,1 +95,4608,2048,64,0,f16,f16,0,1 +95,7168,2048,64,0,f16,f16,0,1 +95,512,2304,64,0,f16,f16,0,1 +95,1536,2304,64,0,f16,f16,0,1 +95,3072,2304,64,0,f16,f16,0,1 +95,4096,2304,64,0,f16,f16,0,1 +95,4608,2304,64,0,f16,f16,0,1 +95,7168,2304,64,0,f16,f16,0,1 +95,512,7168,64,0,f16,f16,0,1 +95,1536,7168,64,0,f16,f16,0,1 +95,3072,7168,64,0,f16,f16,0,1 +95,4096,7168,64,0,f16,f16,0,1 +95,4608,7168,64,0,f16,f16,0,1 +95,7168,7168,64,0,f16,f16,0,1 
+96,576,256,64,0,f16,f16,0,1 +96,1536,256,64,0,f16,f16,0,1 +96,3072,256,64,0,f16,f16,0,1 +96,4096,256,64,0,f16,f16,0,1 +96,4608,256,64,0,f16,f16,0,1 +96,7168,256,64,0,f16,f16,0,1 +96,576,512,64,0,f16,f16,0,1 +96,1536,512,64,0,f16,f16,0,1 +96,3072,512,64,0,f16,f16,0,1 +96,4096,512,64,0,f16,f16,0,1 +96,4608,512,64,0,f16,f16,0,1 +96,7168,512,64,0,f16,f16,0,1 +96,576,1536,64,0,f16,f16,0,1 +96,1536,1536,64,0,f16,f16,0,1 +96,3072,1536,64,0,f16,f16,0,1 +96,4096,1536,64,0,f16,f16,0,1 +96,4608,1536,64,0,f16,f16,0,1 +96,7168,1536,64,0,f16,f16,0,1 +96,512,2048,64,0,f16,f16,0,1 +96,1536,2048,64,0,f16,f16,0,1 +96,3072,2048,64,0,f16,f16,0,1 +96,4096,2048,64,0,f16,f16,0,1 +96,4608,2048,64,0,f16,f16,0,1 +96,7168,2048,64,0,f16,f16,0,1 +96,512,2304,64,0,f16,f16,0,1 +96,1536,2304,64,0,f16,f16,0,1 +96,3072,2304,64,0,f16,f16,0,1 +96,4096,2304,64,0,f16,f16,0,1 +96,4608,2304,64,0,f16,f16,0,1 +96,7168,2304,64,0,f16,f16,0,1 +96,512,7168,64,0,f16,f16,0,1 +96,1536,7168,64,0,f16,f16,0,1 +96,3072,7168,64,0,f16,f16,0,1 +96,4096,7168,64,0,f16,f16,0,1 +96,4608,7168,64,0,f16,f16,0,1 +96,7168,7168,64,0,f16,f16,0,1 +97,576,256,64,0,f16,f16,0,1 +97,1536,256,64,0,f16,f16,0,1 +97,3072,256,64,0,f16,f16,0,1 +97,4096,256,64,0,f16,f16,0,1 +97,4608,256,64,0,f16,f16,0,1 +97,7168,256,64,0,f16,f16,0,1 +97,576,512,64,0,f16,f16,0,1 +97,1536,512,64,0,f16,f16,0,1 +97,3072,512,64,0,f16,f16,0,1 +97,4096,512,64,0,f16,f16,0,1 +97,4608,512,64,0,f16,f16,0,1 +97,7168,512,64,0,f16,f16,0,1 +97,576,1536,64,0,f16,f16,0,1 +97,1536,1536,64,0,f16,f16,0,1 +97,3072,1536,64,0,f16,f16,0,1 +97,4096,1536,64,0,f16,f16,0,1 +97,4608,1536,64,0,f16,f16,0,1 +97,7168,1536,64,0,f16,f16,0,1 +97,512,2048,64,0,f16,f16,0,1 +97,1536,2048,64,0,f16,f16,0,1 +97,3072,2048,64,0,f16,f16,0,1 +97,4096,2048,64,0,f16,f16,0,1 +97,4608,2048,64,0,f16,f16,0,1 +97,7168,2048,64,0,f16,f16,0,1 +97,512,2304,64,0,f16,f16,0,1 +97,1536,2304,64,0,f16,f16,0,1 +97,3072,2304,64,0,f16,f16,0,1 +97,4096,2304,64,0,f16,f16,0,1 +97,4608,2304,64,0,f16,f16,0,1 
+97,7168,2304,64,0,f16,f16,0,1 +97,512,7168,64,0,f16,f16,0,1 +97,1536,7168,64,0,f16,f16,0,1 +97,3072,7168,64,0,f16,f16,0,1 +97,4096,7168,64,0,f16,f16,0,1 +97,4608,7168,64,0,f16,f16,0,1 +97,7168,7168,64,0,f16,f16,0,1 +98,576,256,64,0,f16,f16,0,1 +98,1536,256,64,0,f16,f16,0,1 +98,3072,256,64,0,f16,f16,0,1 +98,4096,256,64,0,f16,f16,0,1 +98,4608,256,64,0,f16,f16,0,1 +98,7168,256,64,0,f16,f16,0,1 +98,576,512,64,0,f16,f16,0,1 +98,1536,512,64,0,f16,f16,0,1 +98,3072,512,64,0,f16,f16,0,1 +98,4096,512,64,0,f16,f16,0,1 +98,4608,512,64,0,f16,f16,0,1 +98,7168,512,64,0,f16,f16,0,1 +98,576,1536,64,0,f16,f16,0,1 +98,1536,1536,64,0,f16,f16,0,1 +98,3072,1536,64,0,f16,f16,0,1 +98,4096,1536,64,0,f16,f16,0,1 +98,4608,1536,64,0,f16,f16,0,1 +98,7168,1536,64,0,f16,f16,0,1 +98,512,2048,64,0,f16,f16,0,1 +98,1536,2048,64,0,f16,f16,0,1 +98,3072,2048,64,0,f16,f16,0,1 +98,4096,2048,64,0,f16,f16,0,1 +98,4608,2048,64,0,f16,f16,0,1 +98,7168,2048,64,0,f16,f16,0,1 +98,512,2304,64,0,f16,f16,0,1 +98,1536,2304,64,0,f16,f16,0,1 +98,3072,2304,64,0,f16,f16,0,1 +98,4096,2304,64,0,f16,f16,0,1 +98,4608,2304,64,0,f16,f16,0,1 +98,7168,2304,64,0,f16,f16,0,1 +98,512,7168,64,0,f16,f16,0,1 +98,1536,7168,64,0,f16,f16,0,1 +98,3072,7168,64,0,f16,f16,0,1 +98,4096,7168,64,0,f16,f16,0,1 +98,4608,7168,64,0,f16,f16,0,1 +98,7168,7168,64,0,f16,f16,0,1 +99,576,256,64,0,f16,f16,0,1 +99,1536,256,64,0,f16,f16,0,1 +99,3072,256,64,0,f16,f16,0,1 +99,4096,256,64,0,f16,f16,0,1 +99,4608,256,64,0,f16,f16,0,1 +99,7168,256,64,0,f16,f16,0,1 +99,576,512,64,0,f16,f16,0,1 +99,1536,512,64,0,f16,f16,0,1 +99,3072,512,64,0,f16,f16,0,1 +99,4096,512,64,0,f16,f16,0,1 +99,4608,512,64,0,f16,f16,0,1 +99,7168,512,64,0,f16,f16,0,1 +99,576,1536,64,0,f16,f16,0,1 +99,1536,1536,64,0,f16,f16,0,1 +99,3072,1536,64,0,f16,f16,0,1 +99,4096,1536,64,0,f16,f16,0,1 +99,4608,1536,64,0,f16,f16,0,1 +99,7168,1536,64,0,f16,f16,0,1 +99,512,2048,64,0,f16,f16,0,1 +99,1536,2048,64,0,f16,f16,0,1 +99,3072,2048,64,0,f16,f16,0,1 +99,4096,2048,64,0,f16,f16,0,1 
+99,4608,2048,64,0,f16,f16,0,1 +99,7168,2048,64,0,f16,f16,0,1 +99,512,2304,64,0,f16,f16,0,1 +99,1536,2304,64,0,f16,f16,0,1 +99,3072,2304,64,0,f16,f16,0,1 +99,4096,2304,64,0,f16,f16,0,1 +99,4608,2304,64,0,f16,f16,0,1 +99,7168,2304,64,0,f16,f16,0,1 +99,512,7168,64,0,f16,f16,0,1 +99,1536,7168,64,0,f16,f16,0,1 +99,3072,7168,64,0,f16,f16,0,1 +99,4096,7168,64,0,f16,f16,0,1 +99,4608,7168,64,0,f16,f16,0,1 +99,7168,7168,64,0,f16,f16,0,1 +100,576,256,64,0,f16,f16,0,1 +100,1536,256,64,0,f16,f16,0,1 +100,3072,256,64,0,f16,f16,0,1 +100,4096,256,64,0,f16,f16,0,1 +100,4608,256,64,0,f16,f16,0,1 +100,7168,256,64,0,f16,f16,0,1 +100,576,512,64,0,f16,f16,0,1 +100,1536,512,64,0,f16,f16,0,1 +100,3072,512,64,0,f16,f16,0,1 +100,4096,512,64,0,f16,f16,0,1 +100,4608,512,64,0,f16,f16,0,1 +100,7168,512,64,0,f16,f16,0,1 +100,576,1536,64,0,f16,f16,0,1 +100,1536,1536,64,0,f16,f16,0,1 +100,3072,1536,64,0,f16,f16,0,1 +100,4096,1536,64,0,f16,f16,0,1 +100,4608,1536,64,0,f16,f16,0,1 +100,7168,1536,64,0,f16,f16,0,1 +100,512,2048,64,0,f16,f16,0,1 +100,1536,2048,64,0,f16,f16,0,1 +100,3072,2048,64,0,f16,f16,0,1 +100,4096,2048,64,0,f16,f16,0,1 +100,4608,2048,64,0,f16,f16,0,1 +100,7168,2048,64,0,f16,f16,0,1 +100,512,2304,64,0,f16,f16,0,1 +100,1536,2304,64,0,f16,f16,0,1 +100,3072,2304,64,0,f16,f16,0,1 +100,4096,2304,64,0,f16,f16,0,1 +100,4608,2304,64,0,f16,f16,0,1 +100,7168,2304,64,0,f16,f16,0,1 +100,512,7168,64,0,f16,f16,0,1 +100,1536,7168,64,0,f16,f16,0,1 +100,3072,7168,64,0,f16,f16,0,1 +100,4096,7168,64,0,f16,f16,0,1 +100,4608,7168,64,0,f16,f16,0,1 +100,7168,7168,64,0,f16,f16,0,1 +101,576,256,64,0,f16,f16,0,1 +101,1536,256,64,0,f16,f16,0,1 +101,3072,256,64,0,f16,f16,0,1 +101,4096,256,64,0,f16,f16,0,1 +101,4608,256,64,0,f16,f16,0,1 +101,7168,256,64,0,f16,f16,0,1 +101,576,512,64,0,f16,f16,0,1 +101,1536,512,64,0,f16,f16,0,1 +101,3072,512,64,0,f16,f16,0,1 +101,4096,512,64,0,f16,f16,0,1 +101,4608,512,64,0,f16,f16,0,1 +101,7168,512,64,0,f16,f16,0,1 +101,576,1536,64,0,f16,f16,0,1 +101,1536,1536,64,0,f16,f16,0,1 
+101,3072,1536,64,0,f16,f16,0,1 +101,4096,1536,64,0,f16,f16,0,1 +101,4608,1536,64,0,f16,f16,0,1 +101,7168,1536,64,0,f16,f16,0,1 +101,512,2048,64,0,f16,f16,0,1 +101,1536,2048,64,0,f16,f16,0,1 +101,3072,2048,64,0,f16,f16,0,1 +101,4096,2048,64,0,f16,f16,0,1 +101,4608,2048,64,0,f16,f16,0,1 +101,7168,2048,64,0,f16,f16,0,1 +101,512,2304,64,0,f16,f16,0,1 +101,1536,2304,64,0,f16,f16,0,1 +101,3072,2304,64,0,f16,f16,0,1 +101,4096,2304,64,0,f16,f16,0,1 +101,4608,2304,64,0,f16,f16,0,1 +101,7168,2304,64,0,f16,f16,0,1 +101,512,7168,64,0,f16,f16,0,1 +101,1536,7168,64,0,f16,f16,0,1 +101,3072,7168,64,0,f16,f16,0,1 +101,4096,7168,64,0,f16,f16,0,1 +101,4608,7168,64,0,f16,f16,0,1 +101,7168,7168,64,0,f16,f16,0,1 +102,576,256,64,0,f16,f16,0,1 +102,1536,256,64,0,f16,f16,0,1 +102,3072,256,64,0,f16,f16,0,1 +102,4096,256,64,0,f16,f16,0,1 +102,4608,256,64,0,f16,f16,0,1 +102,7168,256,64,0,f16,f16,0,1 +102,576,512,64,0,f16,f16,0,1 +102,1536,512,64,0,f16,f16,0,1 +102,3072,512,64,0,f16,f16,0,1 +102,4096,512,64,0,f16,f16,0,1 +102,4608,512,64,0,f16,f16,0,1 +102,7168,512,64,0,f16,f16,0,1 +102,576,1536,64,0,f16,f16,0,1 +102,1536,1536,64,0,f16,f16,0,1 +102,3072,1536,64,0,f16,f16,0,1 +102,4096,1536,64,0,f16,f16,0,1 +102,4608,1536,64,0,f16,f16,0,1 +102,7168,1536,64,0,f16,f16,0,1 +102,512,2048,64,0,f16,f16,0,1 +102,1536,2048,64,0,f16,f16,0,1 +102,3072,2048,64,0,f16,f16,0,1 +102,4096,2048,64,0,f16,f16,0,1 +102,4608,2048,64,0,f16,f16,0,1 +102,7168,2048,64,0,f16,f16,0,1 +102,512,2304,64,0,f16,f16,0,1 +102,1536,2304,64,0,f16,f16,0,1 +102,3072,2304,64,0,f16,f16,0,1 +102,4096,2304,64,0,f16,f16,0,1 +102,4608,2304,64,0,f16,f16,0,1 +102,7168,2304,64,0,f16,f16,0,1 +102,512,7168,64,0,f16,f16,0,1 +102,1536,7168,64,0,f16,f16,0,1 +102,3072,7168,64,0,f16,f16,0,1 +102,4096,7168,64,0,f16,f16,0,1 +102,4608,7168,64,0,f16,f16,0,1 +102,7168,7168,64,0,f16,f16,0,1 +103,576,256,64,0,f16,f16,0,1 +103,1536,256,64,0,f16,f16,0,1 +103,3072,256,64,0,f16,f16,0,1 +103,4096,256,64,0,f16,f16,0,1 +103,4608,256,64,0,f16,f16,0,1 
+103,7168,256,64,0,f16,f16,0,1 +103,576,512,64,0,f16,f16,0,1 +103,1536,512,64,0,f16,f16,0,1 +103,3072,512,64,0,f16,f16,0,1 +103,4096,512,64,0,f16,f16,0,1 +103,4608,512,64,0,f16,f16,0,1 +103,7168,512,64,0,f16,f16,0,1 +103,576,1536,64,0,f16,f16,0,1 +103,1536,1536,64,0,f16,f16,0,1 +103,3072,1536,64,0,f16,f16,0,1 +103,4096,1536,64,0,f16,f16,0,1 +103,4608,1536,64,0,f16,f16,0,1 +103,7168,1536,64,0,f16,f16,0,1 +103,512,2048,64,0,f16,f16,0,1 +103,1536,2048,64,0,f16,f16,0,1 +103,3072,2048,64,0,f16,f16,0,1 +103,4096,2048,64,0,f16,f16,0,1 +103,4608,2048,64,0,f16,f16,0,1 +103,7168,2048,64,0,f16,f16,0,1 +103,512,2304,64,0,f16,f16,0,1 +103,1536,2304,64,0,f16,f16,0,1 +103,3072,2304,64,0,f16,f16,0,1 +103,4096,2304,64,0,f16,f16,0,1 +103,4608,2304,64,0,f16,f16,0,1 +103,7168,2304,64,0,f16,f16,0,1 +103,512,7168,64,0,f16,f16,0,1 +103,1536,7168,64,0,f16,f16,0,1 +103,3072,7168,64,0,f16,f16,0,1 +103,4096,7168,64,0,f16,f16,0,1 +103,4608,7168,64,0,f16,f16,0,1 +103,7168,7168,64,0,f16,f16,0,1 +104,576,256,64,0,f16,f16,0,1 +104,1536,256,64,0,f16,f16,0,1 +104,3072,256,64,0,f16,f16,0,1 +104,4096,256,64,0,f16,f16,0,1 +104,4608,256,64,0,f16,f16,0,1 +104,7168,256,64,0,f16,f16,0,1 +104,576,512,64,0,f16,f16,0,1 +104,1536,512,64,0,f16,f16,0,1 +104,3072,512,64,0,f16,f16,0,1 +104,4096,512,64,0,f16,f16,0,1 +104,4608,512,64,0,f16,f16,0,1 +104,7168,512,64,0,f16,f16,0,1 +104,576,1536,64,0,f16,f16,0,1 +104,1536,1536,64,0,f16,f16,0,1 +104,3072,1536,64,0,f16,f16,0,1 +104,4096,1536,64,0,f16,f16,0,1 +104,4608,1536,64,0,f16,f16,0,1 +104,7168,1536,64,0,f16,f16,0,1 +104,512,2048,64,0,f16,f16,0,1 +104,1536,2048,64,0,f16,f16,0,1 +104,3072,2048,64,0,f16,f16,0,1 +104,4096,2048,64,0,f16,f16,0,1 +104,4608,2048,64,0,f16,f16,0,1 +104,7168,2048,64,0,f16,f16,0,1 +104,512,2304,64,0,f16,f16,0,1 +104,1536,2304,64,0,f16,f16,0,1 +104,3072,2304,64,0,f16,f16,0,1 +104,4096,2304,64,0,f16,f16,0,1 +104,4608,2304,64,0,f16,f16,0,1 +104,7168,2304,64,0,f16,f16,0,1 +104,512,7168,64,0,f16,f16,0,1 +104,1536,7168,64,0,f16,f16,0,1 
+104,3072,7168,64,0,f16,f16,0,1 +104,4096,7168,64,0,f16,f16,0,1 +104,4608,7168,64,0,f16,f16,0,1 +104,7168,7168,64,0,f16,f16,0,1 +105,576,256,64,0,f16,f16,0,1 +105,1536,256,64,0,f16,f16,0,1 +105,3072,256,64,0,f16,f16,0,1 +105,4096,256,64,0,f16,f16,0,1 +105,4608,256,64,0,f16,f16,0,1 +105,7168,256,64,0,f16,f16,0,1 +105,576,512,64,0,f16,f16,0,1 +105,1536,512,64,0,f16,f16,0,1 +105,3072,512,64,0,f16,f16,0,1 +105,4096,512,64,0,f16,f16,0,1 +105,4608,512,64,0,f16,f16,0,1 +105,7168,512,64,0,f16,f16,0,1 +105,576,1536,64,0,f16,f16,0,1 +105,1536,1536,64,0,f16,f16,0,1 +105,3072,1536,64,0,f16,f16,0,1 +105,4096,1536,64,0,f16,f16,0,1 +105,4608,1536,64,0,f16,f16,0,1 +105,7168,1536,64,0,f16,f16,0,1 +105,512,2048,64,0,f16,f16,0,1 +105,1536,2048,64,0,f16,f16,0,1 +105,3072,2048,64,0,f16,f16,0,1 +105,4096,2048,64,0,f16,f16,0,1 +105,4608,2048,64,0,f16,f16,0,1 +105,7168,2048,64,0,f16,f16,0,1 +105,512,2304,64,0,f16,f16,0,1 +105,1536,2304,64,0,f16,f16,0,1 +105,3072,2304,64,0,f16,f16,0,1 +105,4096,2304,64,0,f16,f16,0,1 +105,4608,2304,64,0,f16,f16,0,1 +105,7168,2304,64,0,f16,f16,0,1 +105,512,7168,64,0,f16,f16,0,1 +105,1536,7168,64,0,f16,f16,0,1 +105,3072,7168,64,0,f16,f16,0,1 +105,4096,7168,64,0,f16,f16,0,1 +105,4608,7168,64,0,f16,f16,0,1 +105,7168,7168,64,0,f16,f16,0,1 +106,576,256,64,0,f16,f16,0,1 +106,1536,256,64,0,f16,f16,0,1 +106,3072,256,64,0,f16,f16,0,1 +106,4096,256,64,0,f16,f16,0,1 +106,4608,256,64,0,f16,f16,0,1 +106,7168,256,64,0,f16,f16,0,1 +106,576,512,64,0,f16,f16,0,1 +106,1536,512,64,0,f16,f16,0,1 +106,3072,512,64,0,f16,f16,0,1 +106,4096,512,64,0,f16,f16,0,1 +106,4608,512,64,0,f16,f16,0,1 +106,7168,512,64,0,f16,f16,0,1 +106,576,1536,64,0,f16,f16,0,1 +106,1536,1536,64,0,f16,f16,0,1 +106,3072,1536,64,0,f16,f16,0,1 +106,4096,1536,64,0,f16,f16,0,1 +106,4608,1536,64,0,f16,f16,0,1 +106,7168,1536,64,0,f16,f16,0,1 +106,512,2048,64,0,f16,f16,0,1 +106,1536,2048,64,0,f16,f16,0,1 +106,3072,2048,64,0,f16,f16,0,1 +106,4096,2048,64,0,f16,f16,0,1 +106,4608,2048,64,0,f16,f16,0,1 
+106,7168,2048,64,0,f16,f16,0,1 +106,512,2304,64,0,f16,f16,0,1 +106,1536,2304,64,0,f16,f16,0,1 +106,3072,2304,64,0,f16,f16,0,1 +106,4096,2304,64,0,f16,f16,0,1 +106,4608,2304,64,0,f16,f16,0,1 +106,7168,2304,64,0,f16,f16,0,1 +106,512,7168,64,0,f16,f16,0,1 +106,1536,7168,64,0,f16,f16,0,1 +106,3072,7168,64,0,f16,f16,0,1 +106,4096,7168,64,0,f16,f16,0,1 +106,4608,7168,64,0,f16,f16,0,1 +106,7168,7168,64,0,f16,f16,0,1 +107,576,256,64,0,f16,f16,0,1 +107,1536,256,64,0,f16,f16,0,1 +107,3072,256,64,0,f16,f16,0,1 +107,4096,256,64,0,f16,f16,0,1 +107,4608,256,64,0,f16,f16,0,1 +107,7168,256,64,0,f16,f16,0,1 +107,576,512,64,0,f16,f16,0,1 +107,1536,512,64,0,f16,f16,0,1 +107,3072,512,64,0,f16,f16,0,1 +107,4096,512,64,0,f16,f16,0,1 +107,4608,512,64,0,f16,f16,0,1 +107,7168,512,64,0,f16,f16,0,1 +107,576,1536,64,0,f16,f16,0,1 +107,1536,1536,64,0,f16,f16,0,1 +107,3072,1536,64,0,f16,f16,0,1 +107,4096,1536,64,0,f16,f16,0,1 +107,4608,1536,64,0,f16,f16,0,1 +107,7168,1536,64,0,f16,f16,0,1 +107,512,2048,64,0,f16,f16,0,1 +107,1536,2048,64,0,f16,f16,0,1 +107,3072,2048,64,0,f16,f16,0,1 +107,4096,2048,64,0,f16,f16,0,1 +107,4608,2048,64,0,f16,f16,0,1 +107,7168,2048,64,0,f16,f16,0,1 +107,512,2304,64,0,f16,f16,0,1 +107,1536,2304,64,0,f16,f16,0,1 +107,3072,2304,64,0,f16,f16,0,1 +107,4096,2304,64,0,f16,f16,0,1 +107,4608,2304,64,0,f16,f16,0,1 +107,7168,2304,64,0,f16,f16,0,1 +107,512,7168,64,0,f16,f16,0,1 +107,1536,7168,64,0,f16,f16,0,1 +107,3072,7168,64,0,f16,f16,0,1 +107,4096,7168,64,0,f16,f16,0,1 +107,4608,7168,64,0,f16,f16,0,1 +107,7168,7168,64,0,f16,f16,0,1 +108,576,256,64,0,f16,f16,0,1 +108,1536,256,64,0,f16,f16,0,1 +108,3072,256,64,0,f16,f16,0,1 +108,4096,256,64,0,f16,f16,0,1 +108,4608,256,64,0,f16,f16,0,1 +108,7168,256,64,0,f16,f16,0,1 +108,576,512,64,0,f16,f16,0,1 +108,1536,512,64,0,f16,f16,0,1 +108,3072,512,64,0,f16,f16,0,1 +108,4096,512,64,0,f16,f16,0,1 +108,4608,512,64,0,f16,f16,0,1 +108,7168,512,64,0,f16,f16,0,1 +108,576,1536,64,0,f16,f16,0,1 +108,1536,1536,64,0,f16,f16,0,1 
+108,3072,1536,64,0,f16,f16,0,1 +108,4096,1536,64,0,f16,f16,0,1 +108,4608,1536,64,0,f16,f16,0,1 +108,7168,1536,64,0,f16,f16,0,1 +108,512,2048,64,0,f16,f16,0,1 +108,1536,2048,64,0,f16,f16,0,1 +108,3072,2048,64,0,f16,f16,0,1 +108,4096,2048,64,0,f16,f16,0,1 +108,4608,2048,64,0,f16,f16,0,1 +108,7168,2048,64,0,f16,f16,0,1 +108,512,2304,64,0,f16,f16,0,1 +108,1536,2304,64,0,f16,f16,0,1 +108,3072,2304,64,0,f16,f16,0,1 +108,4096,2304,64,0,f16,f16,0,1 +108,4608,2304,64,0,f16,f16,0,1 +108,7168,2304,64,0,f16,f16,0,1 +108,512,7168,64,0,f16,f16,0,1 +108,1536,7168,64,0,f16,f16,0,1 +108,3072,7168,64,0,f16,f16,0,1 +108,4096,7168,64,0,f16,f16,0,1 +108,4608,7168,64,0,f16,f16,0,1 +108,7168,7168,64,0,f16,f16,0,1 +109,576,256,64,0,f16,f16,0,1 +109,1536,256,64,0,f16,f16,0,1 +109,3072,256,64,0,f16,f16,0,1 +109,4096,256,64,0,f16,f16,0,1 +109,4608,256,64,0,f16,f16,0,1 +109,7168,256,64,0,f16,f16,0,1 +109,576,512,64,0,f16,f16,0,1 +109,1536,512,64,0,f16,f16,0,1 +109,3072,512,64,0,f16,f16,0,1 +109,4096,512,64,0,f16,f16,0,1 +109,4608,512,64,0,f16,f16,0,1 +109,7168,512,64,0,f16,f16,0,1 +109,576,1536,64,0,f16,f16,0,1 +109,1536,1536,64,0,f16,f16,0,1 +109,3072,1536,64,0,f16,f16,0,1 +109,4096,1536,64,0,f16,f16,0,1 +109,4608,1536,64,0,f16,f16,0,1 +109,7168,1536,64,0,f16,f16,0,1 +109,512,2048,64,0,f16,f16,0,1 +109,1536,2048,64,0,f16,f16,0,1 +109,3072,2048,64,0,f16,f16,0,1 +109,4096,2048,64,0,f16,f16,0,1 +109,4608,2048,64,0,f16,f16,0,1 +109,7168,2048,64,0,f16,f16,0,1 +109,512,2304,64,0,f16,f16,0,1 +109,1536,2304,64,0,f16,f16,0,1 +109,3072,2304,64,0,f16,f16,0,1 +109,4096,2304,64,0,f16,f16,0,1 +109,4608,2304,64,0,f16,f16,0,1 +109,7168,2304,64,0,f16,f16,0,1 +109,512,7168,64,0,f16,f16,0,1 +109,1536,7168,64,0,f16,f16,0,1 +109,3072,7168,64,0,f16,f16,0,1 +109,4096,7168,64,0,f16,f16,0,1 +109,4608,7168,64,0,f16,f16,0,1 +109,7168,7168,64,0,f16,f16,0,1 +110,576,256,64,0,f16,f16,0,1 +110,1536,256,64,0,f16,f16,0,1 +110,3072,256,64,0,f16,f16,0,1 +110,4096,256,64,0,f16,f16,0,1 +110,4608,256,64,0,f16,f16,0,1 
+110,7168,256,64,0,f16,f16,0,1 +110,576,512,64,0,f16,f16,0,1 +110,1536,512,64,0,f16,f16,0,1 +110,3072,512,64,0,f16,f16,0,1 +110,4096,512,64,0,f16,f16,0,1 +110,4608,512,64,0,f16,f16,0,1 +110,7168,512,64,0,f16,f16,0,1 +110,576,1536,64,0,f16,f16,0,1 +110,1536,1536,64,0,f16,f16,0,1 +110,3072,1536,64,0,f16,f16,0,1 +110,4096,1536,64,0,f16,f16,0,1 +110,4608,1536,64,0,f16,f16,0,1 +110,7168,1536,64,0,f16,f16,0,1 +110,512,2048,64,0,f16,f16,0,1 +110,1536,2048,64,0,f16,f16,0,1 +110,3072,2048,64,0,f16,f16,0,1 +110,4096,2048,64,0,f16,f16,0,1 +110,4608,2048,64,0,f16,f16,0,1 +110,7168,2048,64,0,f16,f16,0,1 +110,512,2304,64,0,f16,f16,0,1 +110,1536,2304,64,0,f16,f16,0,1 +110,3072,2304,64,0,f16,f16,0,1 +110,4096,2304,64,0,f16,f16,0,1 +110,4608,2304,64,0,f16,f16,0,1 +110,7168,2304,64,0,f16,f16,0,1 +110,512,7168,64,0,f16,f16,0,1 +110,1536,7168,64,0,f16,f16,0,1 +110,3072,7168,64,0,f16,f16,0,1 +110,4096,7168,64,0,f16,f16,0,1 +110,4608,7168,64,0,f16,f16,0,1 +110,7168,7168,64,0,f16,f16,0,1 +111,576,256,64,0,f16,f16,0,1 +111,1536,256,64,0,f16,f16,0,1 +111,3072,256,64,0,f16,f16,0,1 +111,4096,256,64,0,f16,f16,0,1 +111,4608,256,64,0,f16,f16,0,1 +111,7168,256,64,0,f16,f16,0,1 +111,576,512,64,0,f16,f16,0,1 +111,1536,512,64,0,f16,f16,0,1 +111,3072,512,64,0,f16,f16,0,1 +111,4096,512,64,0,f16,f16,0,1 +111,4608,512,64,0,f16,f16,0,1 +111,7168,512,64,0,f16,f16,0,1 +111,576,1536,64,0,f16,f16,0,1 +111,1536,1536,64,0,f16,f16,0,1 +111,3072,1536,64,0,f16,f16,0,1 +111,4096,1536,64,0,f16,f16,0,1 +111,4608,1536,64,0,f16,f16,0,1 +111,7168,1536,64,0,f16,f16,0,1 +111,512,2048,64,0,f16,f16,0,1 +111,1536,2048,64,0,f16,f16,0,1 +111,3072,2048,64,0,f16,f16,0,1 +111,4096,2048,64,0,f16,f16,0,1 +111,4608,2048,64,0,f16,f16,0,1 +111,7168,2048,64,0,f16,f16,0,1 +111,512,2304,64,0,f16,f16,0,1 +111,1536,2304,64,0,f16,f16,0,1 +111,3072,2304,64,0,f16,f16,0,1 +111,4096,2304,64,0,f16,f16,0,1 +111,4608,2304,64,0,f16,f16,0,1 +111,7168,2304,64,0,f16,f16,0,1 +111,512,7168,64,0,f16,f16,0,1 +111,1536,7168,64,0,f16,f16,0,1 
+111,3072,7168,64,0,f16,f16,0,1 +111,4096,7168,64,0,f16,f16,0,1 +111,4608,7168,64,0,f16,f16,0,1 +111,7168,7168,64,0,f16,f16,0,1 +112,576,256,64,0,f16,f16,0,1 +112,1536,256,64,0,f16,f16,0,1 +112,3072,256,64,0,f16,f16,0,1 +112,4096,256,64,0,f16,f16,0,1 +112,4608,256,64,0,f16,f16,0,1 +112,7168,256,64,0,f16,f16,0,1 +112,576,512,64,0,f16,f16,0,1 +112,1536,512,64,0,f16,f16,0,1 +112,3072,512,64,0,f16,f16,0,1 +112,4096,512,64,0,f16,f16,0,1 +112,4608,512,64,0,f16,f16,0,1 +112,7168,512,64,0,f16,f16,0,1 +112,576,1536,64,0,f16,f16,0,1 +112,1536,1536,64,0,f16,f16,0,1 +112,3072,1536,64,0,f16,f16,0,1 +112,4096,1536,64,0,f16,f16,0,1 +112,4608,1536,64,0,f16,f16,0,1 +112,7168,1536,64,0,f16,f16,0,1 +112,512,2048,64,0,f16,f16,0,1 +112,1536,2048,64,0,f16,f16,0,1 +112,3072,2048,64,0,f16,f16,0,1 +112,4096,2048,64,0,f16,f16,0,1 +112,4608,2048,64,0,f16,f16,0,1 +112,7168,2048,64,0,f16,f16,0,1 +112,512,2304,64,0,f16,f16,0,1 +112,1536,2304,64,0,f16,f16,0,1 +112,3072,2304,64,0,f16,f16,0,1 +112,4096,2304,64,0,f16,f16,0,1 +112,4608,2304,64,0,f16,f16,0,1 +112,7168,2304,64,0,f16,f16,0,1 +112,512,7168,64,0,f16,f16,0,1 +112,1536,7168,64,0,f16,f16,0,1 +112,3072,7168,64,0,f16,f16,0,1 +112,4096,7168,64,0,f16,f16,0,1 +112,4608,7168,64,0,f16,f16,0,1 +112,7168,7168,64,0,f16,f16,0,1 +113,576,256,64,0,f16,f16,0,1 +113,1536,256,64,0,f16,f16,0,1 +113,3072,256,64,0,f16,f16,0,1 +113,4096,256,64,0,f16,f16,0,1 +113,4608,256,64,0,f16,f16,0,1 +113,7168,256,64,0,f16,f16,0,1 +113,576,512,64,0,f16,f16,0,1 +113,1536,512,64,0,f16,f16,0,1 +113,3072,512,64,0,f16,f16,0,1 +113,4096,512,64,0,f16,f16,0,1 +113,4608,512,64,0,f16,f16,0,1 +113,7168,512,64,0,f16,f16,0,1 +113,576,1536,64,0,f16,f16,0,1 +113,1536,1536,64,0,f16,f16,0,1 +113,3072,1536,64,0,f16,f16,0,1 +113,4096,1536,64,0,f16,f16,0,1 +113,4608,1536,64,0,f16,f16,0,1 +113,7168,1536,64,0,f16,f16,0,1 +113,512,2048,64,0,f16,f16,0,1 +113,1536,2048,64,0,f16,f16,0,1 +113,3072,2048,64,0,f16,f16,0,1 +113,4096,2048,64,0,f16,f16,0,1 +113,4608,2048,64,0,f16,f16,0,1 
+113,7168,2048,64,0,f16,f16,0,1 +113,512,2304,64,0,f16,f16,0,1 +113,1536,2304,64,0,f16,f16,0,1 +113,3072,2304,64,0,f16,f16,0,1 +113,4096,2304,64,0,f16,f16,0,1 +113,4608,2304,64,0,f16,f16,0,1 +113,7168,2304,64,0,f16,f16,0,1 +113,512,7168,64,0,f16,f16,0,1 +113,1536,7168,64,0,f16,f16,0,1 +113,3072,7168,64,0,f16,f16,0,1 +113,4096,7168,64,0,f16,f16,0,1 +113,4608,7168,64,0,f16,f16,0,1 +113,7168,7168,64,0,f16,f16,0,1 +114,576,256,64,0,f16,f16,0,1 +114,1536,256,64,0,f16,f16,0,1 +114,3072,256,64,0,f16,f16,0,1 +114,4096,256,64,0,f16,f16,0,1 +114,4608,256,64,0,f16,f16,0,1 +114,7168,256,64,0,f16,f16,0,1 +114,576,512,64,0,f16,f16,0,1 +114,1536,512,64,0,f16,f16,0,1 +114,3072,512,64,0,f16,f16,0,1 +114,4096,512,64,0,f16,f16,0,1 +114,4608,512,64,0,f16,f16,0,1 +114,7168,512,64,0,f16,f16,0,1 +114,576,1536,64,0,f16,f16,0,1 +114,1536,1536,64,0,f16,f16,0,1 +114,3072,1536,64,0,f16,f16,0,1 +114,4096,1536,64,0,f16,f16,0,1 +114,4608,1536,64,0,f16,f16,0,1 +114,7168,1536,64,0,f16,f16,0,1 +114,512,2048,64,0,f16,f16,0,1 +114,1536,2048,64,0,f16,f16,0,1 +114,3072,2048,64,0,f16,f16,0,1 +114,4096,2048,64,0,f16,f16,0,1 +114,4608,2048,64,0,f16,f16,0,1 +114,7168,2048,64,0,f16,f16,0,1 +114,512,2304,64,0,f16,f16,0,1 +114,1536,2304,64,0,f16,f16,0,1 +114,3072,2304,64,0,f16,f16,0,1 +114,4096,2304,64,0,f16,f16,0,1 +114,4608,2304,64,0,f16,f16,0,1 +114,7168,2304,64,0,f16,f16,0,1 +114,512,7168,64,0,f16,f16,0,1 +114,1536,7168,64,0,f16,f16,0,1 +114,3072,7168,64,0,f16,f16,0,1 +114,4096,7168,64,0,f16,f16,0,1 +114,4608,7168,64,0,f16,f16,0,1 +114,7168,7168,64,0,f16,f16,0,1 +115,576,256,64,0,f16,f16,0,1 +115,1536,256,64,0,f16,f16,0,1 +115,3072,256,64,0,f16,f16,0,1 +115,4096,256,64,0,f16,f16,0,1 +115,4608,256,64,0,f16,f16,0,1 +115,7168,256,64,0,f16,f16,0,1 +115,576,512,64,0,f16,f16,0,1 +115,1536,512,64,0,f16,f16,0,1 +115,3072,512,64,0,f16,f16,0,1 +115,4096,512,64,0,f16,f16,0,1 +115,4608,512,64,0,f16,f16,0,1 +115,7168,512,64,0,f16,f16,0,1 +115,576,1536,64,0,f16,f16,0,1 +115,1536,1536,64,0,f16,f16,0,1 
+115,3072,1536,64,0,f16,f16,0,1 +115,4096,1536,64,0,f16,f16,0,1 +115,4608,1536,64,0,f16,f16,0,1 +115,7168,1536,64,0,f16,f16,0,1 +115,512,2048,64,0,f16,f16,0,1 +115,1536,2048,64,0,f16,f16,0,1 +115,3072,2048,64,0,f16,f16,0,1 +115,4096,2048,64,0,f16,f16,0,1 +115,4608,2048,64,0,f16,f16,0,1 +115,7168,2048,64,0,f16,f16,0,1 +115,512,2304,64,0,f16,f16,0,1 +115,1536,2304,64,0,f16,f16,0,1 +115,3072,2304,64,0,f16,f16,0,1 +115,4096,2304,64,0,f16,f16,0,1 +115,4608,2304,64,0,f16,f16,0,1 +115,7168,2304,64,0,f16,f16,0,1 +115,512,7168,64,0,f16,f16,0,1 +115,1536,7168,64,0,f16,f16,0,1 +115,3072,7168,64,0,f16,f16,0,1 +115,4096,7168,64,0,f16,f16,0,1 +115,4608,7168,64,0,f16,f16,0,1 +115,7168,7168,64,0,f16,f16,0,1 +116,576,256,64,0,f16,f16,0,1 +116,1536,256,64,0,f16,f16,0,1 +116,3072,256,64,0,f16,f16,0,1 +116,4096,256,64,0,f16,f16,0,1 +116,4608,256,64,0,f16,f16,0,1 +116,7168,256,64,0,f16,f16,0,1 +116,576,512,64,0,f16,f16,0,1 +116,1536,512,64,0,f16,f16,0,1 +116,3072,512,64,0,f16,f16,0,1 +116,4096,512,64,0,f16,f16,0,1 +116,4608,512,64,0,f16,f16,0,1 +116,7168,512,64,0,f16,f16,0,1 +116,576,1536,64,0,f16,f16,0,1 +116,1536,1536,64,0,f16,f16,0,1 +116,3072,1536,64,0,f16,f16,0,1 +116,4096,1536,64,0,f16,f16,0,1 +116,4608,1536,64,0,f16,f16,0,1 +116,7168,1536,64,0,f16,f16,0,1 +116,512,2048,64,0,f16,f16,0,1 +116,1536,2048,64,0,f16,f16,0,1 +116,3072,2048,64,0,f16,f16,0,1 +116,4096,2048,64,0,f16,f16,0,1 +116,4608,2048,64,0,f16,f16,0,1 +116,7168,2048,64,0,f16,f16,0,1 +116,512,2304,64,0,f16,f16,0,1 +116,1536,2304,64,0,f16,f16,0,1 +116,3072,2304,64,0,f16,f16,0,1 +116,4096,2304,64,0,f16,f16,0,1 +116,4608,2304,64,0,f16,f16,0,1 +116,7168,2304,64,0,f16,f16,0,1 +116,512,7168,64,0,f16,f16,0,1 +116,1536,7168,64,0,f16,f16,0,1 +116,3072,7168,64,0,f16,f16,0,1 +116,4096,7168,64,0,f16,f16,0,1 +116,4608,7168,64,0,f16,f16,0,1 +116,7168,7168,64,0,f16,f16,0,1 +117,576,256,64,0,f16,f16,0,1 +117,1536,256,64,0,f16,f16,0,1 +117,3072,256,64,0,f16,f16,0,1 +117,4096,256,64,0,f16,f16,0,1 +117,4608,256,64,0,f16,f16,0,1 
+117,7168,256,64,0,f16,f16,0,1 +117,576,512,64,0,f16,f16,0,1 +117,1536,512,64,0,f16,f16,0,1 +117,3072,512,64,0,f16,f16,0,1 +117,4096,512,64,0,f16,f16,0,1 +117,4608,512,64,0,f16,f16,0,1 +117,7168,512,64,0,f16,f16,0,1 +117,576,1536,64,0,f16,f16,0,1 +117,1536,1536,64,0,f16,f16,0,1 +117,3072,1536,64,0,f16,f16,0,1 +117,4096,1536,64,0,f16,f16,0,1 +117,4608,1536,64,0,f16,f16,0,1 +117,7168,1536,64,0,f16,f16,0,1 +117,512,2048,64,0,f16,f16,0,1 +117,1536,2048,64,0,f16,f16,0,1 +117,3072,2048,64,0,f16,f16,0,1 +117,4096,2048,64,0,f16,f16,0,1 +117,4608,2048,64,0,f16,f16,0,1 +117,7168,2048,64,0,f16,f16,0,1 +117,512,2304,64,0,f16,f16,0,1 +117,1536,2304,64,0,f16,f16,0,1 +117,3072,2304,64,0,f16,f16,0,1 +117,4096,2304,64,0,f16,f16,0,1 +117,4608,2304,64,0,f16,f16,0,1 +117,7168,2304,64,0,f16,f16,0,1 +117,512,7168,64,0,f16,f16,0,1 +117,1536,7168,64,0,f16,f16,0,1 +117,3072,7168,64,0,f16,f16,0,1 +117,4096,7168,64,0,f16,f16,0,1 +117,4608,7168,64,0,f16,f16,0,1 +117,7168,7168,64,0,f16,f16,0,1 +118,576,256,64,0,f16,f16,0,1 +118,1536,256,64,0,f16,f16,0,1 +118,3072,256,64,0,f16,f16,0,1 +118,4096,256,64,0,f16,f16,0,1 +118,4608,256,64,0,f16,f16,0,1 +118,7168,256,64,0,f16,f16,0,1 +118,576,512,64,0,f16,f16,0,1 +118,1536,512,64,0,f16,f16,0,1 +118,3072,512,64,0,f16,f16,0,1 +118,4096,512,64,0,f16,f16,0,1 +118,4608,512,64,0,f16,f16,0,1 +118,7168,512,64,0,f16,f16,0,1 +118,576,1536,64,0,f16,f16,0,1 +118,1536,1536,64,0,f16,f16,0,1 +118,3072,1536,64,0,f16,f16,0,1 +118,4096,1536,64,0,f16,f16,0,1 +118,4608,1536,64,0,f16,f16,0,1 +118,7168,1536,64,0,f16,f16,0,1 +118,512,2048,64,0,f16,f16,0,1 +118,1536,2048,64,0,f16,f16,0,1 +118,3072,2048,64,0,f16,f16,0,1 +118,4096,2048,64,0,f16,f16,0,1 +118,4608,2048,64,0,f16,f16,0,1 +118,7168,2048,64,0,f16,f16,0,1 +118,512,2304,64,0,f16,f16,0,1 +118,1536,2304,64,0,f16,f16,0,1 +118,3072,2304,64,0,f16,f16,0,1 +118,4096,2304,64,0,f16,f16,0,1 +118,4608,2304,64,0,f16,f16,0,1 +118,7168,2304,64,0,f16,f16,0,1 +118,512,7168,64,0,f16,f16,0,1 +118,1536,7168,64,0,f16,f16,0,1 
+118,3072,7168,64,0,f16,f16,0,1 +118,4096,7168,64,0,f16,f16,0,1 +118,4608,7168,64,0,f16,f16,0,1 +118,7168,7168,64,0,f16,f16,0,1 +119,576,256,64,0,f16,f16,0,1 +119,1536,256,64,0,f16,f16,0,1 +119,3072,256,64,0,f16,f16,0,1 +119,4096,256,64,0,f16,f16,0,1 +119,4608,256,64,0,f16,f16,0,1 +119,7168,256,64,0,f16,f16,0,1 +119,576,512,64,0,f16,f16,0,1 +119,1536,512,64,0,f16,f16,0,1 +119,3072,512,64,0,f16,f16,0,1 +119,4096,512,64,0,f16,f16,0,1 +119,4608,512,64,0,f16,f16,0,1 +119,7168,512,64,0,f16,f16,0,1 +119,576,1536,64,0,f16,f16,0,1 +119,1536,1536,64,0,f16,f16,0,1 +119,3072,1536,64,0,f16,f16,0,1 +119,4096,1536,64,0,f16,f16,0,1 +119,4608,1536,64,0,f16,f16,0,1 +119,7168,1536,64,0,f16,f16,0,1 +119,512,2048,64,0,f16,f16,0,1 +119,1536,2048,64,0,f16,f16,0,1 +119,3072,2048,64,0,f16,f16,0,1 +119,4096,2048,64,0,f16,f16,0,1 +119,4608,2048,64,0,f16,f16,0,1 +119,7168,2048,64,0,f16,f16,0,1 +119,512,2304,64,0,f16,f16,0,1 +119,1536,2304,64,0,f16,f16,0,1 +119,3072,2304,64,0,f16,f16,0,1 +119,4096,2304,64,0,f16,f16,0,1 +119,4608,2304,64,0,f16,f16,0,1 +119,7168,2304,64,0,f16,f16,0,1 +119,512,7168,64,0,f16,f16,0,1 +119,1536,7168,64,0,f16,f16,0,1 +119,3072,7168,64,0,f16,f16,0,1 +119,4096,7168,64,0,f16,f16,0,1 +119,4608,7168,64,0,f16,f16,0,1 +119,7168,7168,64,0,f16,f16,0,1 +120,576,256,64,0,f16,f16,0,1 +120,1536,256,64,0,f16,f16,0,1 +120,3072,256,64,0,f16,f16,0,1 +120,4096,256,64,0,f16,f16,0,1 +120,4608,256,64,0,f16,f16,0,1 +120,7168,256,64,0,f16,f16,0,1 +120,576,512,64,0,f16,f16,0,1 +120,1536,512,64,0,f16,f16,0,1 +120,3072,512,64,0,f16,f16,0,1 +120,4096,512,64,0,f16,f16,0,1 +120,4608,512,64,0,f16,f16,0,1 +120,7168,512,64,0,f16,f16,0,1 +120,576,1536,64,0,f16,f16,0,1 +120,1536,1536,64,0,f16,f16,0,1 +120,3072,1536,64,0,f16,f16,0,1 +120,4096,1536,64,0,f16,f16,0,1 +120,4608,1536,64,0,f16,f16,0,1 +120,7168,1536,64,0,f16,f16,0,1 +120,512,2048,64,0,f16,f16,0,1 +120,1536,2048,64,0,f16,f16,0,1 +120,3072,2048,64,0,f16,f16,0,1 +120,4096,2048,64,0,f16,f16,0,1 +120,4608,2048,64,0,f16,f16,0,1 
+120,7168,2048,64,0,f16,f16,0,1 +120,512,2304,64,0,f16,f16,0,1 +120,1536,2304,64,0,f16,f16,0,1 +120,3072,2304,64,0,f16,f16,0,1 +120,4096,2304,64,0,f16,f16,0,1 +120,4608,2304,64,0,f16,f16,0,1 +120,7168,2304,64,0,f16,f16,0,1 +120,512,7168,64,0,f16,f16,0,1 +120,1536,7168,64,0,f16,f16,0,1 +120,3072,7168,64,0,f16,f16,0,1 +120,4096,7168,64,0,f16,f16,0,1 +120,4608,7168,64,0,f16,f16,0,1 +120,7168,7168,64,0,f16,f16,0,1 +121,576,256,64,0,f16,f16,0,1 +121,1536,256,64,0,f16,f16,0,1 +121,3072,256,64,0,f16,f16,0,1 +121,4096,256,64,0,f16,f16,0,1 +121,4608,256,64,0,f16,f16,0,1 +121,7168,256,64,0,f16,f16,0,1 +121,576,512,64,0,f16,f16,0,1 +121,1536,512,64,0,f16,f16,0,1 +121,3072,512,64,0,f16,f16,0,1 +121,4096,512,64,0,f16,f16,0,1 +121,4608,512,64,0,f16,f16,0,1 +121,7168,512,64,0,f16,f16,0,1 +121,576,1536,64,0,f16,f16,0,1 +121,1536,1536,64,0,f16,f16,0,1 +121,3072,1536,64,0,f16,f16,0,1 +121,4096,1536,64,0,f16,f16,0,1 +121,4608,1536,64,0,f16,f16,0,1 +121,7168,1536,64,0,f16,f16,0,1 +121,512,2048,64,0,f16,f16,0,1 +121,1536,2048,64,0,f16,f16,0,1 +121,3072,2048,64,0,f16,f16,0,1 +121,4096,2048,64,0,f16,f16,0,1 +121,4608,2048,64,0,f16,f16,0,1 +121,7168,2048,64,0,f16,f16,0,1 +121,512,2304,64,0,f16,f16,0,1 +121,1536,2304,64,0,f16,f16,0,1 +121,3072,2304,64,0,f16,f16,0,1 +121,4096,2304,64,0,f16,f16,0,1 +121,4608,2304,64,0,f16,f16,0,1 +121,7168,2304,64,0,f16,f16,0,1 +121,512,7168,64,0,f16,f16,0,1 +121,1536,7168,64,0,f16,f16,0,1 +121,3072,7168,64,0,f16,f16,0,1 +121,4096,7168,64,0,f16,f16,0,1 +121,4608,7168,64,0,f16,f16,0,1 +121,7168,7168,64,0,f16,f16,0,1 +122,576,256,64,0,f16,f16,0,1 +122,1536,256,64,0,f16,f16,0,1 +122,3072,256,64,0,f16,f16,0,1 +122,4096,256,64,0,f16,f16,0,1 +122,4608,256,64,0,f16,f16,0,1 +122,7168,256,64,0,f16,f16,0,1 +122,576,512,64,0,f16,f16,0,1 +122,1536,512,64,0,f16,f16,0,1 +122,3072,512,64,0,f16,f16,0,1 +122,4096,512,64,0,f16,f16,0,1 +122,4608,512,64,0,f16,f16,0,1 +122,7168,512,64,0,f16,f16,0,1 +122,576,1536,64,0,f16,f16,0,1 +122,1536,1536,64,0,f16,f16,0,1 
+122,3072,1536,64,0,f16,f16,0,1 +122,4096,1536,64,0,f16,f16,0,1 +122,4608,1536,64,0,f16,f16,0,1 +122,7168,1536,64,0,f16,f16,0,1 +122,512,2048,64,0,f16,f16,0,1 +122,1536,2048,64,0,f16,f16,0,1 +122,3072,2048,64,0,f16,f16,0,1 +122,4096,2048,64,0,f16,f16,0,1 +122,4608,2048,64,0,f16,f16,0,1 +122,7168,2048,64,0,f16,f16,0,1 +122,512,2304,64,0,f16,f16,0,1 +122,1536,2304,64,0,f16,f16,0,1 +122,3072,2304,64,0,f16,f16,0,1 +122,4096,2304,64,0,f16,f16,0,1 +122,4608,2304,64,0,f16,f16,0,1 +122,7168,2304,64,0,f16,f16,0,1 +122,512,7168,64,0,f16,f16,0,1 +122,1536,7168,64,0,f16,f16,0,1 +122,3072,7168,64,0,f16,f16,0,1 +122,4096,7168,64,0,f16,f16,0,1 +122,4608,7168,64,0,f16,f16,0,1 +122,7168,7168,64,0,f16,f16,0,1 +123,576,256,64,0,f16,f16,0,1 +123,1536,256,64,0,f16,f16,0,1 +123,3072,256,64,0,f16,f16,0,1 +123,4096,256,64,0,f16,f16,0,1 +123,4608,256,64,0,f16,f16,0,1 +123,7168,256,64,0,f16,f16,0,1 +123,576,512,64,0,f16,f16,0,1 +123,1536,512,64,0,f16,f16,0,1 +123,3072,512,64,0,f16,f16,0,1 +123,4096,512,64,0,f16,f16,0,1 +123,4608,512,64,0,f16,f16,0,1 +123,7168,512,64,0,f16,f16,0,1 +123,576,1536,64,0,f16,f16,0,1 +123,1536,1536,64,0,f16,f16,0,1 +123,3072,1536,64,0,f16,f16,0,1 +123,4096,1536,64,0,f16,f16,0,1 +123,4608,1536,64,0,f16,f16,0,1 +123,7168,1536,64,0,f16,f16,0,1 +123,512,2048,64,0,f16,f16,0,1 +123,1536,2048,64,0,f16,f16,0,1 +123,3072,2048,64,0,f16,f16,0,1 +123,4096,2048,64,0,f16,f16,0,1 +123,4608,2048,64,0,f16,f16,0,1 +123,7168,2048,64,0,f16,f16,0,1 +123,512,2304,64,0,f16,f16,0,1 +123,1536,2304,64,0,f16,f16,0,1 +123,3072,2304,64,0,f16,f16,0,1 +123,4096,2304,64,0,f16,f16,0,1 +123,4608,2304,64,0,f16,f16,0,1 +123,7168,2304,64,0,f16,f16,0,1 +123,512,7168,64,0,f16,f16,0,1 +123,1536,7168,64,0,f16,f16,0,1 +123,3072,7168,64,0,f16,f16,0,1 +123,4096,7168,64,0,f16,f16,0,1 +123,4608,7168,64,0,f16,f16,0,1 +123,7168,7168,64,0,f16,f16,0,1 +124,576,256,64,0,f16,f16,0,1 +124,1536,256,64,0,f16,f16,0,1 +124,3072,256,64,0,f16,f16,0,1 +124,4096,256,64,0,f16,f16,0,1 +124,4608,256,64,0,f16,f16,0,1 
+124,7168,256,64,0,f16,f16,0,1 +124,576,512,64,0,f16,f16,0,1 +124,1536,512,64,0,f16,f16,0,1 +124,3072,512,64,0,f16,f16,0,1 +124,4096,512,64,0,f16,f16,0,1 +124,4608,512,64,0,f16,f16,0,1 +124,7168,512,64,0,f16,f16,0,1 +124,576,1536,64,0,f16,f16,0,1 +124,1536,1536,64,0,f16,f16,0,1 +124,3072,1536,64,0,f16,f16,0,1 +124,4096,1536,64,0,f16,f16,0,1 +124,4608,1536,64,0,f16,f16,0,1 +124,7168,1536,64,0,f16,f16,0,1 +124,512,2048,64,0,f16,f16,0,1 +124,1536,2048,64,0,f16,f16,0,1 +124,3072,2048,64,0,f16,f16,0,1 +124,4096,2048,64,0,f16,f16,0,1 +124,4608,2048,64,0,f16,f16,0,1 +124,7168,2048,64,0,f16,f16,0,1 +124,512,2304,64,0,f16,f16,0,1 +124,1536,2304,64,0,f16,f16,0,1 +124,3072,2304,64,0,f16,f16,0,1 +124,4096,2304,64,0,f16,f16,0,1 +124,4608,2304,64,0,f16,f16,0,1 +124,7168,2304,64,0,f16,f16,0,1 +124,512,7168,64,0,f16,f16,0,1 +124,1536,7168,64,0,f16,f16,0,1 +124,3072,7168,64,0,f16,f16,0,1 +124,4096,7168,64,0,f16,f16,0,1 +124,4608,7168,64,0,f16,f16,0,1 +124,7168,7168,64,0,f16,f16,0,1 +125,576,256,64,0,f16,f16,0,1 +125,1536,256,64,0,f16,f16,0,1 +125,3072,256,64,0,f16,f16,0,1 +125,4096,256,64,0,f16,f16,0,1 +125,4608,256,64,0,f16,f16,0,1 +125,7168,256,64,0,f16,f16,0,1 +125,576,512,64,0,f16,f16,0,1 +125,1536,512,64,0,f16,f16,0,1 +125,3072,512,64,0,f16,f16,0,1 +125,4096,512,64,0,f16,f16,0,1 +125,4608,512,64,0,f16,f16,0,1 +125,7168,512,64,0,f16,f16,0,1 +125,576,1536,64,0,f16,f16,0,1 +125,1536,1536,64,0,f16,f16,0,1 +125,3072,1536,64,0,f16,f16,0,1 +125,4096,1536,64,0,f16,f16,0,1 +125,4608,1536,64,0,f16,f16,0,1 +125,7168,1536,64,0,f16,f16,0,1 +125,512,2048,64,0,f16,f16,0,1 +125,1536,2048,64,0,f16,f16,0,1 +125,3072,2048,64,0,f16,f16,0,1 +125,4096,2048,64,0,f16,f16,0,1 +125,4608,2048,64,0,f16,f16,0,1 +125,7168,2048,64,0,f16,f16,0,1 +125,512,2304,64,0,f16,f16,0,1 +125,1536,2304,64,0,f16,f16,0,1 +125,3072,2304,64,0,f16,f16,0,1 +125,4096,2304,64,0,f16,f16,0,1 +125,4608,2304,64,0,f16,f16,0,1 +125,7168,2304,64,0,f16,f16,0,1 +125,512,7168,64,0,f16,f16,0,1 +125,1536,7168,64,0,f16,f16,0,1 
+125,3072,7168,64,0,f16,f16,0,1 +125,4096,7168,64,0,f16,f16,0,1 +125,4608,7168,64,0,f16,f16,0,1 +125,7168,7168,64,0,f16,f16,0,1 +126,576,256,64,0,f16,f16,0,1 +126,1536,256,64,0,f16,f16,0,1 +126,3072,256,64,0,f16,f16,0,1 +126,4096,256,64,0,f16,f16,0,1 +126,4608,256,64,0,f16,f16,0,1 +126,7168,256,64,0,f16,f16,0,1 +126,576,512,64,0,f16,f16,0,1 +126,1536,512,64,0,f16,f16,0,1 +126,3072,512,64,0,f16,f16,0,1 +126,4096,512,64,0,f16,f16,0,1 +126,4608,512,64,0,f16,f16,0,1 +126,7168,512,64,0,f16,f16,0,1 +126,576,1536,64,0,f16,f16,0,1 +126,1536,1536,64,0,f16,f16,0,1 +126,3072,1536,64,0,f16,f16,0,1 +126,4096,1536,64,0,f16,f16,0,1 +126,4608,1536,64,0,f16,f16,0,1 +126,7168,1536,64,0,f16,f16,0,1 +126,512,2048,64,0,f16,f16,0,1 +126,1536,2048,64,0,f16,f16,0,1 +126,3072,2048,64,0,f16,f16,0,1 +126,4096,2048,64,0,f16,f16,0,1 +126,4608,2048,64,0,f16,f16,0,1 +126,7168,2048,64,0,f16,f16,0,1 +126,512,2304,64,0,f16,f16,0,1 +126,1536,2304,64,0,f16,f16,0,1 +126,3072,2304,64,0,f16,f16,0,1 +126,4096,2304,64,0,f16,f16,0,1 +126,4608,2304,64,0,f16,f16,0,1 +126,7168,2304,64,0,f16,f16,0,1 +126,512,7168,64,0,f16,f16,0,1 +126,1536,7168,64,0,f16,f16,0,1 +126,3072,7168,64,0,f16,f16,0,1 +126,4096,7168,64,0,f16,f16,0,1 +126,4608,7168,64,0,f16,f16,0,1 +126,7168,7168,64,0,f16,f16,0,1 +127,576,256,64,0,f16,f16,0,1 +127,1536,256,64,0,f16,f16,0,1 +127,3072,256,64,0,f16,f16,0,1 +127,4096,256,64,0,f16,f16,0,1 +127,4608,256,64,0,f16,f16,0,1 +127,7168,256,64,0,f16,f16,0,1 +127,576,512,64,0,f16,f16,0,1 +127,1536,512,64,0,f16,f16,0,1 +127,3072,512,64,0,f16,f16,0,1 +127,4096,512,64,0,f16,f16,0,1 +127,4608,512,64,0,f16,f16,0,1 +127,7168,512,64,0,f16,f16,0,1 +127,576,1536,64,0,f16,f16,0,1 +127,1536,1536,64,0,f16,f16,0,1 +127,3072,1536,64,0,f16,f16,0,1 +127,4096,1536,64,0,f16,f16,0,1 +127,4608,1536,64,0,f16,f16,0,1 +127,7168,1536,64,0,f16,f16,0,1 +127,512,2048,64,0,f16,f16,0,1 +127,1536,2048,64,0,f16,f16,0,1 +127,3072,2048,64,0,f16,f16,0,1 +127,4096,2048,64,0,f16,f16,0,1 +127,4608,2048,64,0,f16,f16,0,1 
+127,7168,2048,64,0,f16,f16,0,1 +127,512,2304,64,0,f16,f16,0,1 +127,1536,2304,64,0,f16,f16,0,1 +127,3072,2304,64,0,f16,f16,0,1 +127,4096,2304,64,0,f16,f16,0,1 +127,4608,2304,64,0,f16,f16,0,1 +127,7168,2304,64,0,f16,f16,0,1 +127,512,7168,64,0,f16,f16,0,1 +127,1536,7168,64,0,f16,f16,0,1 +127,3072,7168,64,0,f16,f16,0,1 +127,4096,7168,64,0,f16,f16,0,1 +127,4608,7168,64,0,f16,f16,0,1 +127,7168,7168,64,0,f16,f16,0,1 +128,576,256,64,0,f16,f16,0,1 +128,1536,256,64,0,f16,f16,0,1 +128,3072,256,64,0,f16,f16,0,1 +128,4096,256,64,0,f16,f16,0,1 +128,4608,256,64,0,f16,f16,0,1 +128,7168,256,64,0,f16,f16,0,1 +128,576,512,64,0,f16,f16,0,1 +128,1536,512,64,0,f16,f16,0,1 +128,3072,512,64,0,f16,f16,0,1 +128,4096,512,64,0,f16,f16,0,1 +128,4608,512,64,0,f16,f16,0,1 +128,7168,512,64,0,f16,f16,0,1 +128,576,1536,64,0,f16,f16,0,1 +128,1536,1536,64,0,f16,f16,0,1 +128,3072,1536,64,0,f16,f16,0,1 +128,4096,1536,64,0,f16,f16,0,1 +128,4608,1536,64,0,f16,f16,0,1 +128,7168,1536,64,0,f16,f16,0,1 +128,512,2048,64,0,f16,f16,0,1 +128,1536,2048,64,0,f16,f16,0,1 +128,3072,2048,64,0,f16,f16,0,1 +128,4096,2048,64,0,f16,f16,0,1 +128,4608,2048,64,0,f16,f16,0,1 +128,7168,2048,64,0,f16,f16,0,1 +128,512,2304,64,0,f16,f16,0,1 +128,1536,2304,64,0,f16,f16,0,1 +128,3072,2304,64,0,f16,f16,0,1 +128,4096,2304,64,0,f16,f16,0,1 +128,4608,2304,64,0,f16,f16,0,1 +128,7168,2304,64,0,f16,f16,0,1 +128,512,7168,64,0,f16,f16,0,1 +128,1536,7168,64,0,f16,f16,0,1 +128,3072,7168,64,0,f16,f16,0,1 +128,4096,7168,64,0,f16,f16,0,1 +128,4608,7168,64,0,f16,f16,0,1 +128,7168,7168,64,0,f16,f16,0,1 diff --git a/aiter/configs/bf16_untuned_batched_gemm.csv b/aiter/configs/bf16_untuned_batched_gemm.csv new file mode 100644 index 0000000000000000000000000000000000000000..849a7f7697041b866bc72a985a0cfb9e231cca3c --- /dev/null +++ b/aiter/configs/bf16_untuned_batched_gemm.csv @@ -0,0 +1,27 @@ +B,M,N,K +16, 1, 1280, 8192 +16, 32, 1280, 8192 +16, 64, 1280, 8192 +16, 128, 1280, 8192 +16, 192, 1280, 8192 +16, 256, 1280, 8192 +16, 320, 1280, 8192 +16, 
512, 1280, 8192 +16, 1024, 1280, 8192 +16, 2048, 1280, 8192 +16, 4096, 1280, 8192 +16, 8192, 1280, 8192 +16, 16384, 1280, 8192 +16, 1, 8192, 1024 +16, 32, 8192, 1024 +16, 64, 8192, 1024 +16, 128, 8192, 1024 +16, 192, 8192, 1024 +16, 256, 8192, 1024 +16, 320, 8192, 1024 +16, 512, 8192, 1024 +16, 1024, 8192, 1024 +16, 2048, 8192, 1024 +16, 4096, 8192, 1024 +16, 8192, 8192, 1024 +16, 16384, 8192, 1024 diff --git a/aiter/configs/ck_tune/tuned_fmoe_ck.csv b/aiter/configs/ck_tune/tuned_fmoe_ck.csv new file mode 100644 index 0000000000000000000000000000000000000000..26b7e55976ac188f7f2ff134fd0e49fc81710b9d --- /dev/null +++ b/aiter/configs/ck_tune/tuned_fmoe_ck.csv @@ -0,0 +1,31 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx936,no_quant,torch.float16,1,256,8192,256,4,0,0,ck,576,309.8021428571405 +gfx936,no_quant,torch.float16,8,256,8192,256,4,0,0,ck,576,364.40785714285965 +gfx936,no_quant,torch.float16,16,256,8192,256,4,0,0,ck,576,705.001571428565 +gfx936,no_quant,torch.float16,24,256,8192,256,4,0,0,ck,272,1028.2664285714334 +gfx936,no_quant,torch.float16,32,256,8192,256,4,0,0,ck,576,1293.2290000000055 +gfx936,no_quant,torch.float16,48,256,8192,256,4,0,0,ck,272,1877.2707142857207 +gfx936,no_quant,torch.float16,64,256,8192,256,4,0,0,ck,576,2193.247142857142 +gfx936,no_quant,torch.float16,96,256,8192,256,4,0,0,ck,272,2471.966714285715 +gfx936,no_quant,torch.float16,128,256,8192,256,4,0,0,ck,576,2724.7889999999948 +gfx936,no_quant,torch.float16,256,256,8192,256,4,0,0,ck,576,3391.9192857142875 +gfx936,no_quant,torch.float16,512,256,8192,256,4,0,0,ck,576,3537.358999999999 +gfx936,no_quant,torch.float16,1024,256,8192,256,4,0,0,ck,272,5022.04200000001 +gfx936,no_quant,torch.float16,2048,256,8192,256,4,0,0,ck,272,6217.148285714277 +gfx936,no_quant,torch.float16,4096,256,8192,256,4,0,0,ck,272,8764.503571428571 +gfx936,no_quant,torch.float16,8192,256,8192,256,4,0,0,ck,272,16520.489142857128 
+gfx936,no_quant,torch.float16,1,256,7168,256,8,0,0,ck,576,274.4422857142844 +gfx936,no_quant,torch.float16,8,256,7168,256,8,0,0,ck,576,584.9560000000014 +gfx936,no_quant,torch.float16,16,256,7168,256,8,0,0,ck,576,1056.360857142859 +gfx936,no_quant,torch.float16,24,256,7168,256,8,0,0,ck,576,1480.196999999999 +gfx936,no_quant,torch.float16,32,256,7168,256,8,0,0,ck,576,1867.970714285712 +gfx936,no_quant,torch.float16,48,256,7168,256,8,0,0,ck,576,2312.378428571425 +gfx936,no_quant,torch.float16,64,256,7168,256,8,0,0,ck,576,2382.001142857145 +gfx936,no_quant,torch.float16,96,256,7168,256,8,0,0,ck,576,2795.691714285705 +gfx936,no_quant,torch.float16,128,256,7168,256,8,0,0,ck,576,2820.2631428571426 +gfx936,no_quant,torch.float16,256,256,7168,256,8,0,0,ck,576,3055.348428571423 +gfx936,no_quant,torch.float16,512,256,7168,256,8,0,0,ck,272,4095.392285714287 +gfx936,no_quant,torch.float16,1024,256,7168,256,8,0,0,ck,272,5514.589714285707 +gfx936,no_quant,torch.float16,2048,256,7168,256,8,0,0,ck,272,7718.288428571438 +gfx936,no_quant,torch.float16,4096,256,7168,256,8,0,0,ck,272,14435.327142857148 +gfx936,no_quant,torch.float16,8192,256,7168,256,8,0,0,ck,272,27862.7415 diff --git a/aiter/configs/ck_tune/tuned_fmoe_ck_int8_w8a8_group.csv b/aiter/configs/ck_tune/tuned_fmoe_ck_int8_w8a8_group.csv new file mode 100644 index 0000000000000000000000000000000000000000..de4a8fd637d804bf6aaf95b52bf3b04ba6c4d587 --- /dev/null +++ b/aiter/configs/ck_tune/tuned_fmoe_ck_int8_w8a8_group.csv @@ -0,0 +1,181 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx936,int8_w8a8_block,torch.float16,1,256,7168,256,8,128,128,ck,1073741824,221.7238 +gfx936,int8_w8a8_block,torch.float16,2,256,7168,256,8,128,128,ck,1073741824,271.6008 +gfx936,int8_w8a8_block,torch.float16,3,256,7168,256,8,128,128,ck,256,347.8034 +gfx936,int8_w8a8_block,torch.float16,4,256,7168,256,8,128,128,ck,256,351.302 
+gfx936,int8_w8a8_block,torch.float16,5,256,7168,256,8,128,128,ck,256,358.7936 +gfx936,int8_w8a8_block,torch.float16,6,256,7168,256,8,128,128,ck,256,449.8388 +gfx936,int8_w8a8_block,torch.float16,7,256,7168,256,8,128,128,ck,256,447.4445 +gfx936,int8_w8a8_block,torch.float16,8,256,7168,256,8,128,128,ck,256,454.0529 +gfx936,int8_w8a8_block,torch.float16,9,256,7168,256,8,128,128,ck,256,470.7523 +gfx936,int8_w8a8_block,torch.float16,10,256,7168,256,8,128,128,ck,256,496.542 +gfx936,int8_w8a8_block,torch.float16,11,256,7168,256,8,128,128,ck,256,496.3151 +gfx936,int8_w8a8_block,torch.float16,12,256,7168,256,8,128,128,ck,256,517.557 +gfx936,int8_w8a8_block,torch.float16,13,256,7168,256,8,128,128,ck,256,710.8428 +gfx936,int8_w8a8_block,torch.float16,14,256,7168,256,8,128,128,ck,256,719.7744 +gfx936,int8_w8a8_block,torch.float16,15,256,7168,256,8,128,128,ck,256,708.893 +gfx936,int8_w8a8_block,torch.float16,16,256,7168,256,8,128,128,ck,256,736.713 +gfx936,int8_w8a8_block,torch.float16,17,256,7168,256,8,128,128,ck,256,723.91 +gfx936,int8_w8a8_block,torch.float16,18,256,7168,256,8,128,128,ck,256,731.7118 +gfx936,int8_w8a8_block,torch.float16,19,256,7168,256,8,128,128,ck,256,921.5805 +gfx936,int8_w8a8_block,torch.float16,20,256,7168,256,8,128,128,ck,256,931.757 +gfx936,int8_w8a8_block,torch.float16,21,256,7168,256,8,128,128,ck,256,925.7805 +gfx936,int8_w8a8_block,torch.float16,22,256,7168,256,8,128,128,ck,256,933.0314 +gfx936,int8_w8a8_block,torch.float16,23,256,7168,256,8,128,128,ck,256,946.4677 +gfx936,int8_w8a8_block,torch.float16,24,256,7168,256,8,128,128,ck,256,966.1805 +gfx936,int8_w8a8_block,torch.float16,25,256,7168,256,8,128,128,ck,256,957.8053 +gfx936,int8_w8a8_block,torch.float16,26,256,7168,256,8,128,128,ck,256,994.1703 +gfx936,int8_w8a8_block,torch.float16,27,256,7168,256,8,128,128,ck,256,1027.6255 +gfx936,int8_w8a8_block,torch.float16,28,256,7168,256,8,128,128,ck,256,986.7748 +gfx936,int8_w8a8_block,torch.float16,29,256,7168,256,8,128,128,ck,256,1012.4916 
+gfx936,int8_w8a8_block,torch.float16,30,256,7168,256,8,128,128,ck,256,1031.3195 +gfx936,int8_w8a8_block,torch.float16,31,256,7168,256,8,128,128,ck,256,1210.7522 +gfx936,int8_w8a8_block,torch.float16,32,256,7168,256,8,128,128,ck,256,1213.3539 +gfx936,int8_w8a8_block,torch.float16,34,256,7168,256,8,128,128,ck,256,1217.8463 +gfx936,int8_w8a8_block,torch.float16,36,256,7168,256,8,128,128,ck,256,1229.2845 +gfx936,int8_w8a8_block,torch.float16,40,256,7168,256,8,128,128,ck,256,1246.6734 +gfx936,int8_w8a8_block,torch.float16,44,256,7168,256,8,128,128,ck,256,1268.3449 +gfx936,int8_w8a8_block,torch.float16,48,256,7168,256,8,128,128,ck,256,1448.0706 +gfx936,int8_w8a8_block,torch.float16,56,256,7168,256,8,128,128,ck,256,1448.6066 +gfx936,int8_w8a8_block,torch.float16,64,256,7168,256,8,128,128,ck,256,1533.3137 +gfx936,int8_w8a8_block,torch.float16,68,256,7168,256,8,128,128,ck,256,1533.2071 +gfx936,int8_w8a8_block,torch.float16,72,256,7168,256,8,128,128,ck,256,1543.6504 +gfx936,int8_w8a8_block,torch.float16,80,256,7168,256,8,128,128,ck,256,1557.7885 +gfx936,int8_w8a8_block,torch.float16,88,256,7168,256,8,128,128,ck,256,1569.5416 +gfx936,int8_w8a8_block,torch.float16,96,256,7168,256,8,128,128,ck,256,1748.5222 +gfx936,int8_w8a8_block,torch.float16,104,256,7168,256,8,128,128,ck,256,1780.6233 +gfx936,int8_w8a8_block,torch.float16,112,256,7168,256,8,128,128,ck,256,1788.6033 +gfx936,int8_w8a8_block,torch.float16,128,256,7168,256,8,128,128,ck,256,1808.0838 +gfx936,int8_w8a8_block,torch.float16,144,256,7168,256,8,128,128,ck,256,1837.7577 +gfx936,int8_w8a8_block,torch.float16,160,256,7168,256,8,128,128,ck,256,1852.4556 +gfx936,int8_w8a8_block,torch.float16,192,256,7168,256,8,128,128,ck,256,1883.3793 +gfx936,int8_w8a8_block,torch.float16,224,256,7168,256,8,128,128,ck,256,1923.4614 +gfx936,int8_w8a8_block,torch.float16,256,256,7168,256,8,128,128,ck,256,1958.9242 +gfx936,int8_w8a8_block,torch.float16,320,256,7168,256,8,128,128,ck,256,2031.9791 
+gfx936,int8_w8a8_block,torch.float16,384,256,7168,256,8,128,128,ck,256,2129.4956 +gfx936,int8_w8a8_block,torch.float16,448,256,7168,256,8,128,128,ck,256,2504.2611 +gfx936,int8_w8a8_block,torch.float16,512,256,7168,256,8,128,128,ck,256,2790.1238 +gfx936,int8_w8a8_block,torch.float16,768,256,7168,256,8,128,128,ck,1073741824,3529.9268 +gfx936,int8_w8a8_block,torch.float16,1024,256,7168,256,8,128,128,ck,256,4366.3522 +gfx936,int8_w8a8_block,torch.float16,1024,256,7168,256,8,128,128,ck,256,4391.4689 +gfx936,int8_w8a8_block,torch.float16,1152,256,7168,256,8,128,128,ck,256,4889.4998 +gfx936,int8_w8a8_block,torch.float16,1,2304,7168,256,8,128,128,ck,256,560.4577 +gfx936,int8_w8a8_block,torch.float16,2,2304,7168,256,8,128,128,ck,1073741824,922.3036 +gfx936,int8_w8a8_block,torch.float16,3,2304,7168,256,8,128,128,ck,256,1211.5382 +gfx936,int8_w8a8_block,torch.float16,4,2304,7168,256,8,128,128,ck,256,1609.7351 +gfx936,int8_w8a8_block,torch.float16,5,2304,7168,256,8,128,128,ck,256,2033.1529 +gfx936,int8_w8a8_block,torch.float16,6,2304,7168,256,8,128,128,ck,256,2095.9823 +gfx936,int8_w8a8_block,torch.float16,7,2304,7168,256,8,128,128,ck,256,2525.8506 +gfx936,int8_w8a8_block,torch.float16,8,2304,7168,256,8,128,128,ck,256,3121.4785 +gfx936,int8_w8a8_block,torch.float16,9,2304,7168,256,8,128,128,ck,256,3602.7825 +gfx936,int8_w8a8_block,torch.float16,10,2304,7168,256,8,128,128,ck,256,3449.7641 +gfx936,int8_w8a8_block,torch.float16,11,2304,7168,256,8,128,128,ck,256,4044.0698 +gfx936,int8_w8a8_block,torch.float16,12,2304,7168,256,8,128,128,ck,256,4134.7811 +gfx936,int8_w8a8_block,torch.float16,13,2304,7168,256,8,128,128,ck,256,4802.8215 +gfx936,int8_w8a8_block,torch.float16,14,2304,7168,256,8,128,128,ck,256,4719.2344 +gfx936,int8_w8a8_block,torch.float16,15,2304,7168,256,8,128,128,ck,256,4980.6316 +gfx936,int8_w8a8_block,torch.float16,16,2304,7168,256,8,128,128,ck,256,5250.5172 +gfx936,int8_w8a8_block,torch.float16,17,2304,7168,256,8,128,128,ck,256,5673.4436 
+gfx936,int8_w8a8_block,torch.float16,18,2304,7168,256,8,128,128,ck,256,5719.9952 +gfx936,int8_w8a8_block,torch.float16,19,2304,7168,256,8,128,128,ck,256,6126.3857 +gfx936,int8_w8a8_block,torch.float16,20,2304,7168,256,8,128,128,ck,256,6389.1556 +gfx936,int8_w8a8_block,torch.float16,21,2304,7168,256,8,128,128,ck,256,6198.0488 +gfx936,int8_w8a8_block,torch.float16,22,2304,7168,256,8,128,128,ck,256,6704.2199 +gfx936,int8_w8a8_block,torch.float16,23,2304,7168,256,8,128,128,ck,256,6957.5245 +gfx936,int8_w8a8_block,torch.float16,24,2304,7168,256,8,128,128,ck,256,6653.9634 +gfx936,int8_w8a8_block,torch.float16,25,2304,7168,256,8,128,128,ck,256,7419.4212 +gfx936,int8_w8a8_block,torch.float16,26,2304,7168,256,8,128,128,ck,256,7740.734 +gfx936,int8_w8a8_block,torch.float16,27,2304,7168,256,8,128,128,ck,256,7583.9258 +gfx936,int8_w8a8_block,torch.float16,28,2304,7168,256,8,128,128,ck,256,8067.6719 +gfx936,int8_w8a8_block,torch.float16,29,2304,7168,256,8,128,128,ck,256,7827.3022 +gfx936,int8_w8a8_block,torch.float16,30,2304,7168,256,8,128,128,ck,256,8226.0128 +gfx936,int8_w8a8_block,torch.float16,31,2304,7168,256,8,128,128,ck,256,8765.0934 +gfx936,int8_w8a8_block,torch.float16,32,2304,7168,256,8,128,128,ck,256,8287.4442 +gfx936,int8_w8a8_block,torch.float16,34,2304,7168,256,8,128,128,ck,256,9189.8303 +gfx936,int8_w8a8_block,torch.float16,36,2304,7168,256,8,128,128,ck,256,9010.0497 +gfx936,int8_w8a8_block,torch.float16,40,2304,7168,256,8,128,128,ck,256,9991.6976 +gfx936,int8_w8a8_block,torch.float16,44,2304,7168,256,8,128,128,ck,256,10406.3407 +gfx936,int8_w8a8_block,torch.float16,48,2304,7168,256,8,128,128,ck,256,10457.3132 +gfx936,int8_w8a8_block,torch.float16,56,2304,7168,256,8,128,128,ck,256,10918.0446 +gfx936,int8_w8a8_block,torch.float16,64,2304,7168,256,8,128,128,ck,256,12263.38 +gfx936,int8_w8a8_block,torch.float16,68,2304,7168,256,8,128,128,ck,256,12388.0282 +gfx936,int8_w8a8_block,torch.float16,72,2304,7168,256,8,128,128,ck,256,12684.0274 
+gfx936,int8_w8a8_block,torch.float16,80,2304,7168,256,8,128,128,ck,256,12786.4017 +gfx936,int8_w8a8_block,torch.float16,88,2304,7168,256,8,128,128,ck,256,12816.288 +gfx936,int8_w8a8_block,torch.float16,96,2304,7168,256,8,128,128,ck,256,13388.8676 +gfx936,int8_w8a8_block,torch.float16,104,2304,7168,256,8,128,128,ck,256,13664.4714 +gfx936,int8_w8a8_block,torch.float16,112,2304,7168,256,8,128,128,ck,1073741824,13843.4207 +gfx936,int8_w8a8_block,torch.float16,128,2304,7168,256,8,128,128,ck,1073741824,14062.0561 +gfx936,int8_w8a8_block,torch.float16,144,2304,7168,256,8,128,128,ck,1073741824,13992.1954 +gfx936,int8_w8a8_block,torch.float16,160,2304,7168,256,8,128,128,ck,1073741824,14250.3336 +gfx936,int8_w8a8_block,torch.float16,192,2304,7168,256,8,128,128,ck,1073741824,14385.9878 +gfx936,int8_w8a8_block,torch.float16,224,2304,7168,256,8,128,128,ck,1073741824,14440.0596 +gfx936,int8_w8a8_block,torch.float16,256,2304,7168,256,8,128,128,ck,1073741824,14530.2234 +gfx936,int8_w8a8_block,torch.float16,320,2304,7168,256,8,128,128,ck,1073741824,14700.4377 +gfx936,int8_w8a8_block,torch.float16,384,2304,7168,256,8,128,128,ck,1073741824,14808.9845 +gfx936,int8_w8a8_block,torch.float16,448,2304,7168,256,8,128,128,ck,1073741824,14983.8729 +gfx936,int8_w8a8_block,torch.float16,512,2304,7168,256,8,128,128,ck,1073741824,15144.0743 +gfx936,int8_w8a8_block,torch.float16,768,2304,7168,256,8,128,128,ck,1073741824,16370.4801 +gfx936,int8_w8a8_block,torch.float16,1024,2304,7168,256,8,128,128,ck,1073741824,22165.7068 +gfx936,int8_w8a8_block,torch.float16,1024,2304,7168,256,8,128,128,ck,1073741824,22200.5112 +gfx936,int8_w8a8_block,torch.float16,1152,2304,7168,256,8,128,128,ck,1073741824,26188.6939 +gfx936,int8_w8a8_block,torch.float16,1,1536,4096,128,8,128,128,ck,256,291.658 +gfx936,int8_w8a8_block,torch.float16,2,1536,4096,128,8,128,128,ck,256,409.0957 +gfx936,int8_w8a8_block,torch.float16,3,1536,4096,128,8,128,128,ck,256,545.9305 
+gfx936,int8_w8a8_block,torch.float16,4,1536,4096,128,8,128,128,ck,256,680.0244 +gfx936,int8_w8a8_block,torch.float16,5,1536,4096,128,8,128,128,ck,256,824.0053 +gfx936,int8_w8a8_block,torch.float16,6,1536,4096,128,8,128,128,ck,256,961.9745 +gfx936,int8_w8a8_block,torch.float16,7,1536,4096,128,8,128,128,ck,256,993.5245 +gfx936,int8_w8a8_block,torch.float16,8,1536,4096,128,8,128,128,ck,256,1152.4533 +gfx936,int8_w8a8_block,torch.float16,9,1536,4096,128,8,128,128,ck,256,1160.7529 +gfx936,int8_w8a8_block,torch.float16,10,1536,4096,128,8,128,128,ck,256,1266.7067 +gfx936,int8_w8a8_block,torch.float16,11,1536,4096,128,8,128,128,ck,256,1295.2701 +gfx936,int8_w8a8_block,torch.float16,12,1536,4096,128,8,128,128,ck,256,1440.103 +gfx936,int8_w8a8_block,torch.float16,13,1536,4096,128,8,128,128,ck,256,1697.6591 +gfx936,int8_w8a8_block,torch.float16,14,1536,4096,128,8,128,128,ck,256,1745.6223 +gfx936,int8_w8a8_block,torch.float16,15,1536,4096,128,8,128,128,ck,256,1710.5023 +gfx936,int8_w8a8_block,torch.float16,16,1536,4096,128,8,128,128,ck,256,1852.1887 +gfx936,int8_w8a8_block,torch.float16,17,1536,4096,128,8,128,128,ck,256,1831.4358 +gfx936,int8_w8a8_block,torch.float16,18,1536,4096,128,8,128,128,ck,256,1838.3458 +gfx936,int8_w8a8_block,torch.float16,19,1536,4096,128,8,128,128,ck,256,2021.7323000000001 +gfx936,int8_w8a8_block,torch.float16,20,1536,4096,128,8,128,128,ck,256,2118.7607 +gfx936,int8_w8a8_block,torch.float16,21,1536,4096,128,8,128,128,ck,256,2139.1852 +gfx936,int8_w8a8_block,torch.float16,22,1536,4096,128,8,128,128,ck,256,2195.0414 +gfx936,int8_w8a8_block,torch.float16,23,1536,4096,128,8,128,128,ck,256,2192.328 +gfx936,int8_w8a8_block,torch.float16,24,1536,4096,128,8,128,128,ck,256,2440.6246 +gfx936,int8_w8a8_block,torch.float16,25,1536,4096,128,8,128,128,ck,256,2200.6248 +gfx936,int8_w8a8_block,torch.float16,26,1536,4096,128,8,128,128,ck,256,2218.4566 +gfx936,int8_w8a8_block,torch.float16,27,1536,4096,128,8,128,128,ck,256,2453.0879 
+gfx936,int8_w8a8_block,torch.float16,28,1536,4096,128,8,128,128,ck,256,2350.3684 +gfx936,int8_w8a8_block,torch.float16,29,1536,4096,128,8,128,128,ck,256,2476.5557 +gfx936,int8_w8a8_block,torch.float16,30,1536,4096,128,8,128,128,ck,256,2487.2788 +gfx936,int8_w8a8_block,torch.float16,31,1536,4096,128,8,128,128,ck,256,2440.2945 +gfx936,int8_w8a8_block,torch.float16,32,1536,4096,128,8,128,128,ck,256,2470.9654 +gfx936,int8_w8a8_block,torch.float16,34,1536,4096,128,8,128,128,ck,256,2640.585 +gfx936,int8_w8a8_block,torch.float16,36,1536,4096,128,8,128,128,ck,256,2472.0041 +gfx936,int8_w8a8_block,torch.float16,40,1536,4096,128,8,128,128,ck,256,2689.1661 +gfx936,int8_w8a8_block,torch.float16,44,1536,4096,128,8,128,128,ck,256,2760.7251 +gfx936,int8_w8a8_block,torch.float16,48,1536,4096,128,8,128,128,ck,256,2789.2965 +gfx936,int8_w8a8_block,torch.float16,56,1536,4096,128,8,128,128,ck,256,2911.2359 +gfx936,int8_w8a8_block,torch.float16,64,1536,4096,128,8,128,128,ck,256,2866.2659 +gfx936,int8_w8a8_block,torch.float16,68,1536,4096,128,8,128,128,ck,256,2932.9077 +gfx936,int8_w8a8_block,torch.float16,72,1536,4096,128,8,128,128,ck,256,2849.2279 +gfx936,int8_w8a8_block,torch.float16,80,1536,4096,128,8,128,128,ck,256,2902.7918 +gfx936,int8_w8a8_block,torch.float16,88,1536,4096,128,8,128,128,ck,1073741824,2991.5017 +gfx936,int8_w8a8_block,torch.float16,96,1536,4096,128,8,128,128,ck,1073741824,3009.2322 +gfx936,int8_w8a8_block,torch.float16,104,1536,4096,128,8,128,128,ck,1073741824,3010.9726 +gfx936,int8_w8a8_block,torch.float16,112,1536,4096,128,8,128,128,ck,1073741824,3021.8461 +gfx936,int8_w8a8_block,torch.float16,128,1536,4096,128,8,128,128,ck,1073741824,3042.2445 +gfx936,int8_w8a8_block,torch.float16,144,1536,4096,128,8,128,128,ck,1073741824,3091.5212 +gfx936,int8_w8a8_block,torch.float16,160,1536,4096,128,8,128,128,ck,1073741824,3116.5366 +gfx936,int8_w8a8_block,torch.float16,192,1536,4096,128,8,128,128,ck,1073741824,3152.8336 
+gfx936,int8_w8a8_block,torch.float16,224,1536,4096,128,8,128,128,ck,1073741824,3192.8889 +gfx936,int8_w8a8_block,torch.float16,256,1536,4096,128,8,128,128,ck,1073741824,3223.7425 +gfx936,int8_w8a8_block,torch.float16,320,1536,4096,128,8,128,128,ck,1073741824,3282.5084 +gfx936,int8_w8a8_block,torch.float16,384,1536,4096,128,8,128,128,ck,1073741824,3431.9543 +gfx936,int8_w8a8_block,torch.float16,448,1536,4096,128,8,128,128,ck,1073741824,3855.0492 +gfx936,int8_w8a8_block,torch.float16,512,1536,4096,128,8,128,128,ck,1073741824,4436.9874 +gfx936,int8_w8a8_block,torch.float16,768,1536,4096,128,8,128,128,ck,1073741824,5567.0914 +gfx936,int8_w8a8_block,torch.float16,1024,1536,4096,128,8,128,128,ck,1073741824,6996.0452 +gfx936,int8_w8a8_block,torch.float16,1024,1536,4096,128,8,128,128,ck,1073741824,6939.3137 +gfx936,int8_w8a8_block,torch.float16,1152,1536,4096,128,8,128,128,ck,1073741824,7784.7617 diff --git a/aiter/configs/ck_tune/untuned_fmoe_int8_w8a8_group.csv b/aiter/configs/ck_tune/untuned_fmoe_int8_w8a8_group.csv new file mode 100644 index 0000000000000000000000000000000000000000..38d68ae0fdb0cecbf434b99a50a62bbb455b7b62 --- /dev/null +++ b/aiter/configs/ck_tune/untuned_fmoe_int8_w8a8_group.csv @@ -0,0 +1,183 @@ +quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k +int8_w8a8_block,torch.float16,1,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,2,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,3,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,4,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,5,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,6,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,7,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,8,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,9,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,10,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,11,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,12,256,7168,256,8,128,128 
+int8_w8a8_block,torch.float16,13,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,14,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,15,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,16,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,17,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,18,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,19,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,20,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,21,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,22,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,23,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,24,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,25,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,26,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,27,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,28,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,29,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,30,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,31,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,32,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,34,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,36,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,40,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,44,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,48,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,56,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,64,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,68,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,72,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,80,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,88,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,96,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,104,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,112,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,128,256,7168,256,8,128,128 
+int8_w8a8_block,torch.float16,144,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,160,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,192,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,224,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,256,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,320,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,384,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,448,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,512,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,768,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,1024,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,1024,256,7168,256,8,128,128 +int8_w8a8_block,torch.float16,1152,256,7168,256,8,128,128 + +int8_w8a8_block,torch.float16,1,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,2,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,3,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,4,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,5,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,6,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,7,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,8,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,9,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,10,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,11,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,12,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,13,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,14,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,15,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,16,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,17,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,18,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,19,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,20,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,21,2304,7168,256,8,128,128 
+int8_w8a8_block,torch.float16,22,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,23,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,24,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,25,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,26,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,27,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,28,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,29,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,30,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,31,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,32,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,34,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,36,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,40,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,44,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,48,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,56,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,64,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,68,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,72,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,80,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,88,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,96,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,104,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,112,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,128,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,144,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,160,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,192,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,224,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,256,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,320,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,384,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,448,2304,7168,256,8,128,128 
+int8_w8a8_block,torch.float16,512,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,768,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,1024,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,1024,2304,7168,256,8,128,128 +int8_w8a8_block,torch.float16,1152,2304,7168,256,8,128,128 + +int8_w8a8_block,torch.float16,1,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,2,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,3,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,4,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,5,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,6,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,7,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,8,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,9,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,10,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,11,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,12,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,13,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,14,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,15,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,16,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,17,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,18,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,19,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,20,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,21,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,22,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,23,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,24,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,25,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,26,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,27,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,28,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,29,1536,4096,128,8,128,128 
+int8_w8a8_block,torch.float16,30,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,31,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,32,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,34,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,36,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,40,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,44,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,48,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,56,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,64,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,68,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,72,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,80,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,88,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,96,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,104,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,112,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,128,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,144,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,160,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,192,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,224,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,256,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,320,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,384,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,448,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,512,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,768,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,1024,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,1024,1536,4096,128,8,128,128 +int8_w8a8_block,torch.float16,1152,1536,4096,128,8,128,128 diff --git a/aiter/configs/tuned_fmoe.csv b/aiter/configs/tuned_fmoe.csv new file mode 100644 index 0000000000000000000000000000000000000000..9d62ca397c8a7d05bc60761fd9853dbade69c969 
--- /dev/null +++ b/aiter/configs/tuned_fmoe.csv @@ -0,0 +1,12 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us,tag,err +256,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_128x128,1,0,32,0,270.8542,fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_32x512_pf2,1.6% +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Tensor,1,0,64,0,346.93691000000007,fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf3,0.0% +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Token,1,0,64,0,340.65007,fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf3,0.0% +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,64,0,336.42205,fmoe_stage1_bf16_pertokenFp8_g1u1_64x256_pf3,0.0% +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.int4,QuantType.per_Tensor,1,0,128,0,699.5049699999998,ck_128,0.1% +512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Tensor,1,0,64,0,347.0561,fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf3,0.0% +512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Token,1,0,64,0,334.47364,fmoe_stage1_bf16_pertokenInt8_g1u1_64x256_pf3,0.0% +512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,64,0,357.0689999999998,fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf3,0.0% +512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.int4,QuantType.per_Tensor,1,0,128,0,837.6014699999998,ck_128,0.0% +4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,18.364460000000005,fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3,0.0% 
+4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,19.47728,fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3,0.0% diff --git a/aiter/configs/tuned_fmoe_asm.csv b/aiter/configs/tuned_fmoe_asm.csv new file mode 100644 index 0000000000000000000000000000000000000000..adb197ba8552eec421eba69d3e827750883ebbe2 --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm.csv @@ -0,0 +1,243 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx938,no_quant,torch.float16,1,352,4096,128,8,0,0,asm,10008+20000,76.1135 +gfx938,no_quant,torch.float16,2,352,4096,128,8,0,0,asm,10011+20002,116.9911 +gfx938,no_quant,torch.float16,4,352,4096,128,8,0,0,asm,10002+20000,190.2544 +gfx938,no_quant,torch.float16,6,352,4096,128,8,0,0,asm,10011+20000,255.6735 +gfx938,no_quant,torch.float16,8,352,4096,128,8,0,0,asm,10002+20000,327.9644 +gfx938,no_quant,torch.float16,10,352,4096,128,8,0,0,asm,10002+20000,381.3467 +gfx938,no_quant,torch.float16,12,352,4096,128,8,0,0,asm,10002+20000,425.2932 +gfx938,no_quant,torch.float16,14,352,4096,128,8,0,0,asm,10002+20000,466.8067 +gfx938,no_quant,torch.float16,16,352,4096,128,8,0,0,asm,10002+20000,502.9911 +gfx938,no_quant,torch.float16,20,352,4096,128,8,0,0,asm,10002+20000,559.3775 +gfx938,no_quant,torch.float16,24,352,4096,128,8,0,0,asm,10002+20000,579.1763 +gfx938,no_quant,torch.float16,28,352,4096,128,8,0,0,asm,10002+20000,598.1905 +gfx938,no_quant,torch.float16,32,352,4096,128,8,0,0,asm,10002+20000,617.1032 +gfx938,no_quant,torch.float16,36,352,4096,128,8,0,0,asm,10002+20000,634.1831 +gfx938,no_quant,torch.float16,40,352,4096,128,8,0,0,asm,10002+20000,650.6875 +gfx938,no_quant,torch.float16,44,352,4096,128,8,0,0,asm,10002+20000,673.797 +gfx938,no_quant,torch.float16,48,352,4096,128,8,0,0,asm,10002+20000,688.9969 +gfx938,no_quant,torch.float16,56,352,4096,128,8,0,0,asm,10002+20000,690.6337 
+gfx938,no_quant,torch.float16,64,352,4096,128,8,0,0,asm,10002+20000,705.25 +gfx938,no_quant,torch.float16,68,352,4096,128,8,0,0,asm,10002+20000,692.6071 +gfx938,no_quant,torch.float16,72,352,4096,128,8,0,0,asm,10002+20000,693.4728 +gfx938,no_quant,torch.float16,80,352,4096,128,8,0,0,asm,10002+20000,693.0667 +gfx938,no_quant,torch.float16,88,352,4096,128,8,0,0,asm,10002+20000,698.6174 +gfx938,no_quant,torch.float16,96,352,4096,128,8,0,0,asm,10002+20000,701.0585 +gfx938,no_quant,torch.float16,104,352,4096,128,8,0,0,asm,10002+20000,700.9929 +gfx938,no_quant,torch.float16,112,352,4096,128,8,0,0,asm,10002+20000,702.7078 +gfx938,no_quant,torch.float16,128,352,4096,128,8,0,0,asm,10002+20000,706.9581 +gfx938,no_quant,torch.float16,144,352,4096,128,8,0,0,asm,10002+20000,729.0544 +gfx938,no_quant,torch.float16,160,352,4096,128,8,0,0,asm,10002+20000,722.2728 +gfx938,no_quant,torch.float16,192,352,4096,128,8,0,0,asm,10011+20000,742.4902 +gfx938,no_quant,torch.float16,224,352,4096,128,8,0,0,asm,11004+21001,774.5392 +gfx938,no_quant,torch.float16,256,352,4096,128,8,0,0,asm,11004+21001,780.4756 +gfx938,no_quant,torch.float16,320,352,4096,128,8,0,0,asm,11004+21001,796.5904 +gfx938,no_quant,torch.float16,384,352,4096,128,8,0,0,asm,11005+21001,804.2211 +gfx938,no_quant,torch.float16,448,352,4096,128,8,0,0,asm,12005+22001,833.0702 +gfx938,no_quant,torch.float16,512,352,4096,128,8,0,0,asm,12005+22001,847.1173 +gfx938,no_quant,torch.float16,576,352,4096,128,8,0,0,asm,12005+22001,857.4557 +gfx938,no_quant,torch.float16,640,352,4096,128,8,0,0,asm,12005+22001,867.1542 +gfx938,no_quant,torch.float16,704,352,4096,128,8,0,0,asm,12005+22001,878.2925 +gfx938,no_quant,torch.float16,768,352,4096,128,8,0,0,asm,12005+22001,890.8995 +gfx938,no_quant,torch.float16,832,352,4096,128,8,0,0,asm,12005+22001,900.5198 +gfx938,no_quant,torch.float16,896,352,4096,128,8,0,0,asm,12001+22001,958.5009 +gfx938,no_quant,torch.float16,960,352,4096,128,8,0,0,asm,12003+22001,985.6514 
+gfx938,no_quant,torch.float16,1024,352,4096,128,8,0,0,asm,13001+23001,1062.3093 +gfx938,no_quant,torch.float16,1152,352,4096,128,8,0,0,asm,13001+23001,1079.7408 +gfx938,no_quant,torch.float16,1280,352,4096,128,8,0,0,asm,13001+23001,1090.4801 +gfx938,no_quant,torch.float16,1408,352,4096,128,8,0,0,asm,13001+23001,1110.9976 +gfx938,no_quant,torch.float16,1536,352,4096,128,8,0,0,asm,13001+23001,1123.6784 +gfx938,no_quant,torch.float16,1664,352,4096,128,8,0,0,asm,13001+23001,1141.3393 +gfx938,no_quant,torch.float16,1792,352,4096,128,8,0,0,asm,13001+23001,1163.7203 +gfx938,no_quant,torch.float16,1920,352,4096,128,8,0,0,asm,13001+23001,1304.7715 +gfx938,no_quant,torch.float16,2048,352,4096,128,8,0,0,asm,13001+23001,1584.7923 +gfx938,no_quant,torch.float16,2304,352,4096,128,8,0,0,asm,13001+23001,1943.0033 +gfx938,no_quant,torch.float16,2560,352,4096,128,8,0,0,asm,13001+23001,1988.7219 +gfx938,no_quant,torch.float16,2816,352,4096,128,8,0,0,asm,13001+23001,2026.4655 +gfx938,no_quant,torch.float16,3072,352,4096,128,8,0,0,asm,13001+23001,2056.3481 +gfx938,no_quant,torch.float16,3328,352,4096,128,8,0,0,asm,13001+23001,2081.0787 +gfx938,no_quant,torch.float16,3584,352,4096,128,8,0,0,asm,13001+23001,2113.5351 +gfx938,no_quant,torch.float16,3840,352,4096,128,8,0,0,asm,13001+23001,2251.9895 +gfx938,no_quant,torch.float16,4096,352,4096,128,8,0,0,asm,13001+23001,2568.3675 +gfx938,no_quant,torch.float16,5120,352,4096,128,8,0,0,asm,13001+23001,3066.6487 +gfx938,no_quant,torch.float16,6144,352,4096,128,8,0,0,asm,13001+23001,3581.1465 +gfx938,no_quant,torch.float16,7168,352,4096,128,8,0,0,asm,13001+23001,4122.1716 +gfx938,no_quant,torch.float16,7680,352,4096,128,8,0,0,asm,13001+23001,4222.0785 +gfx938,no_quant,torch.float16,8192,352,4096,128,8,0,0,asm,13001+23001,4607.7508 +gfx938,no_quant,torch.float16,10240,352,4096,128,8,0,0,asm,13001+23001,5676.0447 +gfx938,no_quant,torch.float16,12288,352,4096,128,8,0,0,asm,13001+23001,6679.6272 
+gfx938,no_quant,torch.float16,14336,352,4096,128,8,0,0,asm,13001+23001,7789.8272 +gfx938,no_quant,torch.float16,16384,352,4096,128,8,0,0,asm,13001+23001,8726.181 +gfx938,no_quant,torch.float16,17408,352,4096,128,8,0,0,asm,13001+23001,9263.6738 +gfx938,no_quant,torch.float16,24576,352,4096,128,8,0,0,asm,13001+23001,12830.282 +gfx938,no_quant,torch.float16,32768,352,4096,128,8,0,0,asm,13001+23001,17031.4351 +gfx938,no_quant,torch.float16,40960,352,4096,128,8,0,0,asm,13001+23001,21120.0199 +gfx938,no_quant,torch.float16,49152,352,4096,128,8,0,0,asm,13001+23001,25184.3324 +gfx938,no_quant,torch.float16,57344,352,4096,128,8,0,0,asm,13001+23001,29341.3454 +gfx938,no_quant,torch.float16,65536,352,4096,128,8,0,0,asm,13001+23001,33437.6425 +gfx938,no_quant,torch.float16,1,352,4096,129,9,0,0,asm,10008+20000,79.718 +gfx938,no_quant,torch.float16,2,352,4096,129,9,0,0,asm,10011+20002,118.7885 +gfx938,no_quant,torch.float16,4,352,4096,129,9,0,0,asm,10002+20000,196.5523 +gfx938,no_quant,torch.float16,6,352,4096,129,9,0,0,asm,10011+20000,270.1234 +gfx938,no_quant,torch.float16,8,352,4096,129,9,0,0,asm,10002+20000,322.2257 +gfx938,no_quant,torch.float16,10,352,4096,129,9,0,0,asm,10002+20000,379.8174 +gfx938,no_quant,torch.float16,12,352,4096,129,9,0,0,asm,10002+20000,416.2644 +gfx938,no_quant,torch.float16,14,352,4096,129,9,0,0,asm,10002+20000,457.8191 +gfx938,no_quant,torch.float16,16,352,4096,129,9,0,0,asm,10002+20000,493.3677 +gfx938,no_quant,torch.float16,20,352,4096,129,9,0,0,asm,10002+20000,562.0455 +gfx938,no_quant,torch.float16,24,352,4096,129,9,0,0,asm,10002+20000,609.2494 +gfx938,no_quant,torch.float16,28,352,4096,129,9,0,0,asm,10002+20000,639.2872 +gfx938,no_quant,torch.float16,32,352,4096,129,9,0,0,asm,10002+20000,650.7579 +gfx938,no_quant,torch.float16,36,352,4096,129,9,0,0,asm,10002+20000,680.4902 +gfx938,no_quant,torch.float16,40,352,4096,129,9,0,0,asm,10002+20000,686.082 +gfx938,no_quant,torch.float16,48,352,4096,129,9,0,0,asm,10002+20000,712.4491 
+gfx938,no_quant,torch.float16,56,352,4096,129,9,0,0,asm,10002+20000,710.9639 +gfx938,no_quant,torch.float16,64,352,4096,129,9,0,0,asm,10002+20000,730.5607 +gfx938,no_quant,torch.float16,72,352,4096,129,9,0,0,asm,10002+20000,718.7957 +gfx938,no_quant,torch.float16,80,352,4096,129,9,0,0,asm,10002+20000,720.0142 +gfx938,no_quant,torch.float16,88,352,4096,129,9,0,0,asm,10002+20000,722.9229 +gfx938,no_quant,torch.float16,96,352,4096,129,9,0,0,asm,10002+20000,723.0624 +gfx938,no_quant,torch.float16,104,352,4096,129,9,0,0,asm,10002+20000,724.0839 +gfx938,no_quant,torch.float16,112,352,4096,129,9,0,0,asm,10002+20000,735.3413 +gfx938,no_quant,torch.float16,128,352,4096,129,9,0,0,asm,10002+20000,737.2243 +gfx938,no_quant,torch.float16,144,352,4096,129,9,0,0,asm,10002+20000,754.1186 +gfx938,no_quant,torch.float16,160,352,4096,129,9,0,0,asm,10002+20000,753.3679 +gfx938,no_quant,torch.float16,192,352,4096,129,9,0,0,asm,10002+20000,788.8426 +gfx938,no_quant,torch.float16,224,352,4096,129,9,0,0,asm,11004+21001,801.5606 +gfx938,no_quant,torch.float16,256,352,4096,129,9,0,0,asm,11004+21001,807.7226 +gfx938,no_quant,torch.float16,320,352,4096,129,9,0,0,asm,11004+21001,826.3646 +gfx938,no_quant,torch.float16,384,352,4096,129,9,0,0,asm,12005+22001,854.2537 +gfx938,no_quant,torch.float16,448,352,4096,129,9,0,0,asm,12005+22001,864.8998 +gfx938,no_quant,torch.float16,512,352,4096,129,9,0,0,asm,12005+22001,883.4228 +gfx938,no_quant,torch.float16,576,352,4096,129,9,0,0,asm,12005+22001,895.6977 +gfx938,no_quant,torch.float16,640,352,4096,129,9,0,0,asm,12001+22001,908.3787 +gfx938,no_quant,torch.float16,768,352,4096,129,9,0,0,asm,12001+22001,915.4966 +gfx938,no_quant,torch.float16,960,352,4096,129,9,0,0,asm,13001+23001,1079.7907 +gfx938,no_quant,torch.float16,1024,352,4096,129,9,0,0,asm,13001+23001,1088.9094 +gfx938,no_quant,torch.float16,1280,352,4096,129,9,0,0,asm,13001+23001,1129.3092 +gfx938,no_quant,torch.float16,1536,352,4096,129,9,0,0,asm,13001+23001,1169.2723 
+gfx938,no_quant,torch.float16,1920,352,4096,129,9,0,0,asm,13001+23001,1764.185 +gfx938,no_quant,torch.float16,2048,352,4096,129,9,0,0,asm,13001+23001,1934.3223 +gfx938,no_quant,torch.float16,2304,352,4096,129,9,0,0,asm,13001+23001,2006.923 +gfx938,no_quant,torch.float16,2560,352,4096,129,9,0,0,asm,13001+23001,2035.540 +gfx938,no_quant,torch.float16,2816,352,4096,129,9,0,0,asm,13001+23001,2068.5062 +gfx938,no_quant,torch.float16,3072,352,4096,129,9,0,0,asm,13001+23001,2102.0827 +gfx938,no_quant,torch.float16,3584,352,4096,129,9,0,0,asm,13001+23001,2447.431 +gfx938,no_quant,torch.float16,3840,352,4096,129,9,0,0,asm,13001+23001,2805.2093 +gfx938,no_quant,torch.float16,4096,352,4096,129,9,0,0,asm,13001+23001,3007.0664 +gfx938,no_quant,torch.float16,4608,352,4096,129,9,0,0,asm,13001+23001,3123.0345 +gfx938,no_quant,torch.float16,5120,352,4096,129,9,0,0,asm,13001+23001,3246.8705 +gfx938,no_quant,torch.float16,5632,352,4096,129,9,0,0,asm,13001+23001,3764.9074 +gfx938,no_quant,torch.float16,6144,352,4096,129,9,0,0,asm,13001+23001,4092.9607 +gfx938,no_quant,torch.float16,6656,352,4096,129,9,0,0,asm,13001+23001,4175.6811 +gfx938,no_quant,torch.float16,7168,352,4096,129,9,0,0,asm,13001+23001,4504.6947 +gfx938,no_quant,torch.float16,7680,352,4096,129,9,0,0,asm,13001+23001,5027.1321 +gfx938,no_quant,torch.float16,8192,352,4096,129,9,0,0,asm,13001+23001,5169.6112 +gfx938,no_quant,torch.float16,10240,352,4096,129,9,0,0,asm,13001+23001,6238.8422 +gfx938,no_quant,torch.float16,12288,352,4096,129,9,0,0,asm,13001+23001,7419.1773 +gfx938,no_quant,torch.float16,16384,352,4096,129,9,0,0,asm,13001+23001,9753.8878 +gfx938,no_quant,torch.float16,24576,352,4096,129,9,0,0,asm,13001+23001,14398.8284 +gfx938,no_quant,torch.float16,32768,352,4096,129,9,0,0,asm,13001+23001,19058.7232 +gfx938,no_quant,torch.float16,40960,352,4096,129,9,0,0,asm,13001+23001,23722.1115 +gfx938,no_quant,torch.float16,49152,352,4096,129,9,0,0,asm,13001+23001,28329.0767 
+gfx938,no_quant,torch.float16,65536,352,4096,129,9,0,0,asm,13001+23001,37562.0269 +gfx936,no_quant,torch.float16,1,256,3072,256,8,0,0,asm,10002+20000,55.456 +gfx936,no_quant,torch.float16,2,256,3072,256,8,0,0,asm,10002+20000,86.6223 +gfx936,no_quant,torch.float16,4,256,3072,256,8,0,0,asm,10002+20000,155.6412 +gfx936,no_quant,torch.float16,6,256,3072,256,8,0,0,asm,10001+20000,212.4495 +gfx936,no_quant,torch.float16,8,256,3072,256,8,0,0,asm,10001+20000,258.5125 +gfx936,no_quant,torch.float16,12,256,3072,256,8,0,0,asm,10001+20000,349.5272 +gfx936,no_quant,torch.float16,16,256,3072,256,8,0,0,asm,10001+20000,424.9797 +gfx936,no_quant,torch.float16,24,256,3072,256,8,0,0,asm,10001+20000,569.7428 +gfx936,no_quant,torch.float16,32,256,3072,256,8,0,0,asm,10001+20000,698.3014 +gfx936,no_quant,torch.float16,36,256,3072,256,8,0,0,asm,10001+20000,731.8899 +gfx936,no_quant,torch.float16,48,256,3072,256,8,0,0,asm,10001+20000,823.0646 +gfx936,no_quant,torch.float16,56,256,3072,256,8,0,0,asm,10001+20000,869.7846 +gfx936,no_quant,torch.float16,64,256,3072,256,8,0,0,asm,10001+20000,902.1634 +gfx936,no_quant,torch.float16,72,256,3072,256,8,0,0,asm,10001+20000,939.9486 +gfx936,no_quant,torch.float16,80,256,3072,256,8,0,0,asm,10001+20000,966.9634 +gfx936,no_quant,torch.float16,88,256,3072,256,8,0,0,asm,10001+20000,994.9044 +gfx936,no_quant,torch.float16,96,256,3072,256,8,0,0,asm,10001+20000,1011.3254 +gfx936,no_quant,torch.float16,100,256,3072,256,8,0,0,asm,10001+20000,1012.3611 +gfx936,no_quant,torch.float16,112,256,3072,256,8,0,0,asm,10001+20000,1031.8055 +gfx936,no_quant,torch.float16,128,256,3072,256,8,0,0,asm,10001+20000,1058.5927 +gfx936,no_quant,torch.float16,144,256,3072,256,8,0,0,asm,10001+20000,1068.6726 +gfx936,no_quant,torch.float16,160,256,3072,256,8,0,0,asm,10001+20000,1079.586 +gfx936,no_quant,torch.float16,192,256,3072,256,8,0,0,asm,10001+20000,1096.3268 +gfx936,no_quant,torch.float16,224,256,3072,256,8,0,0,asm,10001+20000,1111.8469 
+gfx936,no_quant,torch.float16,256,256,3072,256,8,0,0,asm,10001+20000,1125.3208 +gfx936,no_quant,torch.float16,320,256,3072,256,8,0,0,asm,10001+20000,1151.3166 +gfx936,no_quant,torch.float16,384,256,3072,256,8,0,0,asm,10002+20000,1176.2598 +gfx936,no_quant,torch.float16,448,256,3072,256,8,0,0,asm,10002+20000,1216.6051 +gfx936,no_quant,torch.float16,512,256,3072,256,8,0,0,asm,12005+22001,1234.8115 +gfx936,no_quant,torch.float16,640,256,3072,256,8,0,0,asm,12005+22001,1258.5252 +gfx936,no_quant,torch.float16,768,256,3072,256,8,0,0,asm,12005+22001,1282.3315 +gfx936,no_quant,torch.float16,896,256,3072,256,8,0,0,asm,12005+22001,1306.7272 +gfx936,no_quant,torch.float16,1024,256,3072,256,8,0,0,asm,12005+22001,1334.2219 +gfx936,no_quant,torch.float16,1280,256,3072,256,8,0,0,asm,12005+22001,1376.8577 +gfx936,no_quant,torch.float16,1536,256,3072,256,8,0,0,asm,12005+22001,1437.6745 +gfx936,no_quant,torch.float16,2048,256,3072,256,8,0,0,asm,13001+23001,1541.6658 +gfx936,no_quant,torch.float16,2304,256,3072,256,8,0,0,asm,13001+23001,1576.0315 +gfx936,no_quant,torch.float16,2560,256,3072,256,8,0,0,asm,13001+23001,1624.385 +gfx936,no_quant,torch.float16,3072,256,3072,256,8,0,0,asm,13001+23001,1700.9494 +gfx936,no_quant,torch.float16,3584,256,3072,256,8,0,0,asm,13001+23001,1818.2295 +gfx936,no_quant,torch.float16,4096,256,3072,256,8,0,0,asm,13001+23001,1965.16 +gfx936,no_quant,torch.float16,5120,256,3072,256,8,0,0,asm,13001+23001,2322.7428 +gfx936,no_quant,torch.float16,6144,256,3072,256,8,0,0,asm,13001+23001,2458.768 +gfx936,no_quant,torch.float16,7168,256,3072,256,8,0,0,asm,13001+23001,2695.1215 +gfx936,no_quant,torch.float16,8192,256,3072,256,8,0,0,asm,13001+23001,3011.6433 +gfx936,no_quant,torch.float16,10240,256,3072,256,8,0,0,asm,13001+23001,3578.2532 +gfx936,no_quant,torch.float16,12288,256,3072,256,8,0,0,asm,13001+23001,4153.0568 +gfx936,no_quant,torch.float16,16384,256,3072,256,8,0,0,asm,13001+23001,5263.1272 
+gfx936,no_quant,torch.float16,24576,256,3072,256,8,0,0,asm,13001+23001,7644.2697 +gfx936,no_quant,torch.float16,32768,256,3072,256,8,0,0,asm,13001+23001,10052.4186 +gfx936,no_quant,torch.float16,1,128,3072,256,8,0,0,asm,10000+20000,41.3928 +gfx936,no_quant,torch.float16,2,128,3072,256,8,0,0,asm,10002+20000,57.1149 +gfx936,no_quant,torch.float16,4,128,3072,256,8,0,0,asm,10002+20002,89.8055 +gfx936,no_quant,torch.float16,6,128,3072,256,8,0,0,asm,10002+20002,119.0938 +gfx936,no_quant,torch.float16,8,128,3072,256,8,0,0,asm,10002+20000,144.6431 +gfx936,no_quant,torch.float16,12,128,3072,256,8,0,0,asm,10002+20000,194.622 +gfx936,no_quant,torch.float16,16,128,3072,256,8,0,0,asm,10001+20000,235.6157 +gfx936,no_quant,torch.float16,24,128,3072,256,8,0,0,asm,10001+20000,306.181 +gfx936,no_quant,torch.float16,32,128,3072,256,8,0,0,asm,10001+20002,378.7229 +gfx936,no_quant,torch.float16,36,128,3072,256,8,0,0,asm,10001+20002,388.2777 +gfx936,no_quant,torch.float16,48,128,3072,256,8,0,0,asm,10001+20000,433.7514 +gfx936,no_quant,torch.float16,56,128,3072,256,8,0,0,asm,10001+20000,457.8524 +gfx936,no_quant,torch.float16,64,128,3072,256,8,0,0,asm,10001+20002,475.2419 +gfx936,no_quant,torch.float16,72,128,3072,256,8,0,0,asm,10001+20002,493.6672 +gfx936,no_quant,torch.float16,80,128,3072,256,8,0,0,asm,10001+20002,507.8987 +gfx936,no_quant,torch.float16,88,128,3072,256,8,0,0,asm,10001+20002,524.2018 +gfx936,no_quant,torch.float16,96,128,3072,256,8,0,0,asm,10001+20002,531.7892 +gfx936,no_quant,torch.float16,100,128,3072,256,8,0,0,asm,10001+20002,532.3534 +gfx936,no_quant,torch.float16,112,128,3072,256,8,0,0,asm,10001+20002,543.4018 +gfx936,no_quant,torch.float16,128,128,3072,256,8,0,0,asm,10001+20002,556.3197 +gfx936,no_quant,torch.float16,144,128,3072,256,8,0,0,asm,10001+20002,562.2986 +gfx936,no_quant,torch.float16,160,128,3072,256,8,0,0,asm,10001+20000,569.8103 +gfx936,no_quant,torch.float16,192,128,3072,256,8,0,0,asm,10001+20002,577.9871 
+gfx936,no_quant,torch.float16,224,128,3072,256,8,0,0,asm,10001+20002,588.0249 +gfx936,no_quant,torch.float16,256,128,3072,256,8,0,0,asm,10001+20002,595.6291 +gfx936,no_quant,torch.float16,320,128,3072,256,8,0,0,asm,10001+20002,610.5007 +gfx936,no_quant,torch.float16,384,128,3072,256,8,0,0,asm,10002+20002,625.9365 +gfx936,no_quant,torch.float16,448,128,3072,256,8,0,0,asm,11007+21001,636.3196 +gfx936,no_quant,torch.float16,512,128,3072,256,8,0,0,asm,11007+21001,649.5155 +gfx936,no_quant,torch.float16,640,128,3072,256,8,0,0,asm,11004+21001,690.1469 +gfx936,no_quant,torch.float16,768,128,3072,256,8,0,0,asm,11006+21001,706.9048 +gfx936,no_quant,torch.float16,896,128,3072,256,8,0,0,asm,11007+21001,707.7806 +gfx936,no_quant,torch.float16,1024,128,3072,256,8,0,0,asm,11005+21001,737.069 +gfx936,no_quant,torch.float16,1280,128,3072,256,8,0,0,asm,12005+22001,785.271 +gfx936,no_quant,torch.float16,1536,128,3072,256,8,0,0,asm,12004+22001,845.0015 +gfx936,no_quant,torch.float16,2048,128,3072,256,8,0,0,asm,12001+22001,926.7277 +gfx936,no_quant,torch.float16,2304,128,3072,256,8,0,0,asm,13001+23001,967.7214 +gfx936,no_quant,torch.float16,2560,128,3072,256,8,0,0,asm,13001+23001,993.0266 +gfx936,no_quant,torch.float16,3072,128,3072,256,8,0,0,asm,13001+23001,1061.8433 +gfx936,no_quant,torch.float16,3584,128,3072,256,8,0,0,asm,13001+23001,1178.6685 +gfx936,no_quant,torch.float16,4096,128,3072,256,8,0,0,asm,13001+23001,1286.8873 +gfx936,no_quant,torch.float16,5120,128,3072,256,8,0,0,asm,13001+23001,1546.3902 +gfx936,no_quant,torch.float16,6144,128,3072,256,8,0,0,asm,13001+23001,1648.3774 +gfx936,no_quant,torch.float16,7168,128,3072,256,8,0,0,asm,13001+23001,1815.4257 +gfx936,no_quant,torch.float16,8192,128,3072,256,8,0,0,asm,13001+23001,2049.7412 +gfx936,no_quant,torch.float16,10240,128,3072,256,8,0,0,asm,13001+23001,2439.6861 +gfx936,no_quant,torch.float16,12288,128,3072,256,8,0,0,asm,13001+23001,2854.5487 
+gfx936,no_quant,torch.float16,16384,128,3072,256,8,0,0,asm,13001+23001,3669.7898 +gfx936,no_quant,torch.float16,24576,128,3072,256,8,0,0,asm,13001+23001,5322.7565 +gfx936,no_quant,torch.float16,32768,128,3072,256,8,0,0,asm,13001+23001,7028.0263 diff --git a/aiter/configs/tuned_fmoe_asm_shuffle.csv b/aiter/configs/tuned_fmoe_asm_shuffle.csv new file mode 100644 index 0000000000000000000000000000000000000000..3f4ff50797c417d71752f67b41bf379b2fe72a32 --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm_shuffle.csv @@ -0,0 +1,223 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx938,no_quant,torch.float16,1,352,4096,128,8,0,0,asm,10009+20000,78.6408 +gfx938,no_quant,torch.float16,2,352,4096,128,8,0,0,asm,10004+20002,117.1482 +gfx938,no_quant,torch.float16,4,352,4096,128,8,0,0,asm,10006+20002,181.0486 +gfx938,no_quant,torch.float16,6,352,4096,128,8,0,0,asm,10006+20000,236.5396 +gfx938,no_quant,torch.float16,8,352,4096,128,8,0,0,asm,10007+20000,291.5917 +gfx938,no_quant,torch.float16,10,352,4096,128,8,0,0,asm,10006+20000,336.7976 +gfx938,no_quant,torch.float16,12,352,4096,128,8,0,0,asm,10006+20000,370.9924 +gfx938,no_quant,torch.float16,16,352,4096,128,8,0,0,asm,10006+20000,436.4362 +gfx938,no_quant,torch.float16,20,352,4096,128,8,0,0,asm,10006+20000,490.3447 +gfx938,no_quant,torch.float16,24,352,4096,128,8,0,0,asm,10006+20000,508.1621 +gfx938,no_quant,torch.float16,32,352,4096,128,8,0,0,asm,10007+20000,537.4327 +gfx938,no_quant,torch.float16,36,352,4096,128,8,0,0,asm,10007+20000,556.2111 +gfx938,no_quant,torch.float16,40,352,4096,128,8,0,0,asm,10006+20000,572.0756 +gfx938,no_quant,torch.float16,48,352,4096,128,8,0,0,asm,10006+20000,606.7093 +gfx938,no_quant,torch.float16,56,352,4096,128,8,0,0,asm,10006+20000,608.8057 +gfx938,no_quant,torch.float16,64,352,4096,128,8,0,0,asm,10007+20000,618.06 +gfx938,no_quant,torch.float16,72,352,4096,128,8,0,0,asm,10006+20000,612.0755 
+gfx938,no_quant,torch.float16,80,352,4096,128,8,0,0,asm,10006+20000,615.2016 +gfx938,no_quant,torch.float16,88,352,4096,128,8,0,0,asm,10006+20000,620.6252 +gfx938,no_quant,torch.float16,96,352,4096,128,8,0,0,asm,10006+20000,622.9226 +gfx938,no_quant,torch.float16,112,352,4096,128,8,0,0,asm,10006+20000,625.9872 +gfx938,no_quant,torch.float16,128,352,4096,128,8,0,0,asm,10007+20000,631.9154 +gfx938,no_quant,torch.float16,144,352,4096,128,8,0,0,asm,10006+20000,648.1697 +gfx938,no_quant,torch.float16,160,352,4096,128,8,0,0,asm,10006+20000,648.3789 +gfx938,no_quant,torch.float16,192,352,4096,128,8,0,0,asm,10007+20000,667.0865 +gfx938,no_quant,torch.float16,224,352,4096,128,8,0,0,asm,11004+21001,679.0988 +gfx938,no_quant,torch.float16,256,352,4096,128,8,0,0,asm,11002+21001,691.788 +gfx938,no_quant,torch.float16,320,352,4096,128,8,0,0,asm,11004+21001,704.5919 +gfx938,no_quant,torch.float16,384,352,4096,128,8,0,0,asm,11004+21001,724.2637 +gfx938,no_quant,torch.float16,512,352,4096,128,8,0,0,asm,12000+22001,774.9958 +gfx938,no_quant,torch.float16,640,352,4096,128,8,0,0,asm,12000+22001,799.6931 +gfx938,no_quant,torch.float16,768,352,4096,128,8,0,0,asm,12004+22001,822.5894 +gfx938,no_quant,torch.float16,960,352,4096,128,8,0,0,asm,12005+22001,900.0783 +gfx938,no_quant,torch.float16,1024,352,4096,128,8,0,0,asm,12005+22001,947.0566 +gfx938,no_quant,torch.float16,1152,352,4096,128,8,0,0,asm,13001+23001,1047.4336 +gfx938,no_quant,torch.float16,1280,352,4096,128,8,0,0,asm,13001+23001,1077.3083 +gfx938,no_quant,torch.float16,1408,352,4096,128,8,0,0,asm,13001+23001,1088.9267 +gfx938,no_quant,torch.float16,1536,352,4096,128,8,0,0,asm,13001+23001,1121.5625 +gfx938,no_quant,torch.float16,2048,352,4096,128,8,0,0,asm,13001+23001,1410.5335 +gfx938,no_quant,torch.float16,2304,352,4096,128,8,0,0,asm,12001+22001,1612.1491 +gfx938,no_quant,torch.float16,2560,352,4096,128,8,0,0,asm,12001+22001,1651.5132 +gfx938,no_quant,torch.float16,2816,352,4096,128,8,0,0,asm,12001+22001,1739.7137 
+gfx938,no_quant,torch.float16,3072,352,4096,128,8,0,0,asm,13001+23001,1789.1421 +gfx938,no_quant,torch.float16,3840,352,4096,128,8,0,0,asm,13001+23001,1984.0983 +gfx938,no_quant,torch.float16,4096,352,4096,128,8,0,0,asm,13001+23001,2248.1545 +gfx938,no_quant,torch.float16,5120,352,4096,128,8,0,0,asm,13001+23001,2635.2017 +gfx938,no_quant,torch.float16,6144,352,4096,128,8,0,0,asm,13001+23001,3099.4145 +gfx938,no_quant,torch.float16,7168,352,4096,128,8,0,0,asm,13001+23001,3564.3297 +gfx938,no_quant,torch.float16,7680,352,4096,128,8,0,0,asm,13001+23001,3648.124 +gfx938,no_quant,torch.float16,8192,352,4096,128,8,0,0,asm,13001+23001,3963.4827 +gfx938,no_quant,torch.float16,12288,352,4096,128,8,0,0,asm,13001+23001,5748.9863 +gfx938,no_quant,torch.float16,16384,352,4096,128,8,0,0,asm,13001+23001,7546.4485 +gfx938,no_quant,torch.float16,24576,352,4096,128,8,0,0,asm,13001+23001,11088.5202 +gfx938,no_quant,torch.float16,32768,352,4096,128,8,0,0,asm,13001+23001,14713.3474 +gfx938,no_quant,torch.float16,49152,352,4096,128,8,0,0,asm,13001+23001,21815.9769 +gfx938,no_quant,torch.float16,65536,352,4096,128,8,0,0,asm,13001+23001,29012.8921 +gfx938,no_quant,torch.float16,1,352,4096,129,9,0,0,asm,10004+20000,82.8296 +gfx938,no_quant,torch.float16,2,352,4096,129,9,0,0,asm,10004+20002,117.517 +gfx938,no_quant,torch.float16,4,352,4096,129,9,0,0,asm,10006+20002,183.4528 +gfx938,no_quant,torch.float16,6,352,4096,129,9,0,0,asm,10006+20000,241.0813 +gfx938,no_quant,torch.float16,8,352,4096,129,9,0,0,asm,10007+20000,279.1283 +gfx938,no_quant,torch.float16,10,352,4096,129,9,0,0,asm,10006+20000,330.1271 +gfx938,no_quant,torch.float16,12,352,4096,129,9,0,0,asm,10006+20000,359.8869 +gfx938,no_quant,torch.float16,14,352,4096,129,9,0,0,asm,10006+20000,395.9155 +gfx938,no_quant,torch.float16,16,352,4096,129,9,0,0,asm,10006+20000,424.8344 +gfx938,no_quant,torch.float16,20,352,4096,129,9,0,0,asm,10007+20000,483.2763 +gfx938,no_quant,torch.float16,24,352,4096,129,9,0,0,asm,10006+20000,521.8279 
+gfx938,no_quant,torch.float16,28,352,4096,129,9,0,0,asm,10006+20000,548.0873 +gfx938,no_quant,torch.float16,32,352,4096,129,9,0,0,asm,10006+20000,558.2041 +gfx938,no_quant,torch.float16,36,352,4096,129,9,0,0,asm,10006+20000,586.8205 +gfx938,no_quant,torch.float16,40,352,4096,129,9,0,0,asm,10006+20000,588.6543 +gfx938,no_quant,torch.float16,48,352,4096,129,9,0,0,asm,10006+20000,611.8296 +gfx938,no_quant,torch.float16,56,352,4096,129,9,0,0,asm,10006+20000,614.9763 +gfx938,no_quant,torch.float16,64,352,4096,129,9,0,0,asm,10006+20000,632.5423 +gfx938,no_quant,torch.float16,72,352,4096,129,9,0,0,asm,10006+20000,624.0347 +gfx938,no_quant,torch.float16,80,352,4096,129,9,0,0,asm,10006+20000,629.3024 +gfx938,no_quant,torch.float16,88,352,4096,129,9,0,0,asm,10006+20000,631.3577 +gfx938,no_quant,torch.float16,96,352,4096,129,9,0,0,asm,10006+20000,630.9926 +gfx938,no_quant,torch.float16,104,352,4096,129,9,0,0,asm,10006+20000,633.9628 +gfx938,no_quant,torch.float16,112,352,4096,129,9,0,0,asm,10007+20000,644.289 +gfx938,no_quant,torch.float16,128,352,4096,129,9,0,0,asm,10007+20000,647.6202 +gfx938,no_quant,torch.float16,144,352,4096,129,9,0,0,asm,10006+20000,659.6407 +gfx938,no_quant,torch.float16,160,352,4096,129,9,0,0,asm,10006+20000,665.6427 +gfx938,no_quant,torch.float16,192,352,4096,129,9,0,0,asm,11004+21001,682.6765 +gfx938,no_quant,torch.float16,224,352,4096,129,9,0,0,asm,11004+21001,688.2437 +gfx938,no_quant,torch.float16,256,352,4096,129,9,0,0,asm,11004+21001,694.6395 +gfx938,no_quant,torch.float16,320,352,4096,129,9,0,0,asm,11002+21001,719.4435 +gfx938,no_quant,torch.float16,384,352,4096,129,9,0,0,asm,11004+21001,752.6085 +gfx938,no_quant,torch.float16,448,352,4096,129,9,0,0,asm,12000+22001,773.6054 +gfx938,no_quant,torch.float16,512,352,4096,129,9,0,0,asm,12000+22001,791.353 +gfx938,no_quant,torch.float16,576,352,4096,129,9,0,0,asm,12004+22001,804.8627 +gfx938,no_quant,torch.float16,640,352,4096,129,9,0,0,asm,12004+22001,820.5673 
+gfx938,no_quant,torch.float16,768,352,4096,129,9,0,0,asm,12000+22001,862.1548 +gfx938,no_quant,torch.float16,960,352,4096,129,9,0,0,asm,12001+22001,993.7481 +gfx938,no_quant,torch.float16,1024,352,4096,129,9,0,0,asm,12001+22001,1051.5161 +gfx938,no_quant,torch.float16,1280,352,4096,129,9,0,0,asm,13001+23001,1115.6184 +gfx938,no_quant,torch.float16,1536,352,4096,129,9,0,0,asm,13001+23001,1145.0624 +gfx938,no_quant,torch.float16,1920,352,4096,129,9,0,0,asm,12001+22001,1517.5169 +gfx938,no_quant,torch.float16,2048,352,4096,129,9,0,0,asm,12001+22001,1602.4269 +gfx938,no_quant,torch.float16,2304,352,4096,129,9,0,0,asm,12001+22001,1694.8448 +gfx938,no_quant,torch.float16,2560,352,4096,129,9,0,0,asm,12001+22001,1768.7974 +gfx938,no_quant,torch.float16,2816,352,4096,129,9,0,0,asm,13001+23001,1804.2844 +gfx938,no_quant,torch.float16,3072,352,4096,129,9,0,0,asm,13001+23001,1848.514 +gfx938,no_quant,torch.float16,3584,352,4096,129,9,0,0,asm,13001+23001,2137.4626 +gfx938,no_quant,torch.float16,3840,352,4096,129,9,0,0,asm,13001+23001,2433.5947 +gfx938,no_quant,torch.float16,4096,352,4096,129,9,0,0,asm,13001+23001,2581.0639 +gfx938,no_quant,torch.float16,4608,352,4096,129,9,0,0,asm,13001+23001,2696.285 +gfx938,no_quant,torch.float16,5120,352,4096,129,9,0,0,asm,13001+23001,2818.4465 +gfx938,no_quant,torch.float16,5632,352,4096,129,9,0,0,asm,13001+23001,3244.0284 +gfx938,no_quant,torch.float16,6144,352,4096,129,9,0,0,asm,13001+23001,3530.5874 +gfx938,no_quant,torch.float16,6656,352,4096,129,9,0,0,asm,13001+23001,3601.098 +gfx938,no_quant,torch.float16,7168,352,4096,129,9,0,0,asm,13001+23001,3889.9903 +gfx938,no_quant,torch.float16,7680,352,4096,129,9,0,0,asm,13001+23001,4337.7228 +gfx938,no_quant,torch.float16,8192,352,4096,129,9,0,0,asm,13001+23001,4443.3674 +gfx938,no_quant,torch.float16,10240,352,4096,129,9,0,0,asm,13001+23001,5380.2134 +gfx938,no_quant,torch.float16,12288,352,4096,129,9,0,0,asm,13001+23001,6382.2687 
+gfx938,no_quant,torch.float16,16384,352,4096,129,9,0,0,asm,13001+23001,8397.3437 +gfx938,no_quant,torch.float16,24576,352,4096,129,9,0,0,asm,13001+23001,12379.6318 +gfx938,no_quant,torch.float16,32768,352,4096,129,9,0,0,asm,13001+23001,16397.3012 +gfx938,no_quant,torch.float16,40960,352,4096,129,9,0,0,asm,13001+23001,20398.5288 +gfx938,no_quant,torch.float16,49152,352,4096,129,9,0,0,asm,13001+23001,24396.6972 +gfx938,no_quant,torch.float16,65536,352,4096,129,9,0,0,asm,13001+23001,32435.0655 +gfx936,no_quant,torch.float16,1,256,3072,256,8,0,0,asm,10006+20000,56.4327 +gfx936,no_quant,torch.float16,2,256,3072,256,8,0,0,asm,10006+20000,85.2664 +gfx936,no_quant,torch.float16,4,256,3072,256,8,0,0,asm,10004+20000,148.02 +gfx936,no_quant,torch.float16,6,256,3072,256,8,0,0,asm,10001+20000,198.0409 +gfx936,no_quant,torch.float16,8,256,3072,256,8,0,0,asm,10001+20000,237.9062 +gfx936,no_quant,torch.float16,12,256,3072,256,8,0,0,asm,10001+20000,320.1882 +gfx936,no_quant,torch.float16,16,256,3072,256,8,0,0,asm,10001+20000,388.3143 +gfx936,no_quant,torch.float16,24,256,3072,256,8,0,0,asm,10001+20000,520.2353 +gfx936,no_quant,torch.float16,32,256,3072,256,8,0,0,asm,10001+20000,634.3687 +gfx936,no_quant,torch.float16,36,256,3072,256,8,0,0,asm,10001+20000,662.1298 +gfx936,no_quant,torch.float16,48,256,3072,256,8,0,0,asm,10001+20000,745.2455 +gfx936,no_quant,torch.float16,56,256,3072,256,8,0,0,asm,10001+20000,784.2686 +gfx936,no_quant,torch.float16,64,256,3072,256,8,0,0,asm,10001+20000,822.5338 +gfx936,no_quant,torch.float16,72,256,3072,256,8,0,0,asm,10001+20000,850.3484 +gfx936,no_quant,torch.float16,80,256,3072,256,8,0,0,asm,10001+20000,875.1736 +gfx936,no_quant,torch.float16,88,256,3072,256,8,0,0,asm,10001+20000,901.001 +gfx936,no_quant,torch.float16,96,256,3072,256,8,0,0,asm,10001+20000,914.5758 +gfx936,no_quant,torch.float16,100,256,3072,256,8,0,0,asm,10001+20000,917.5063 +gfx936,no_quant,torch.float16,112,256,3072,256,8,0,0,asm,10001+20000,936.2515 
+gfx936,no_quant,torch.float16,128,256,3072,256,8,0,0,asm,10001+20000,967.6199 +gfx936,no_quant,torch.float16,144,256,3072,256,8,0,0,asm,10001+20000,976.1083 +gfx936,no_quant,torch.float16,160,256,3072,256,8,0,0,asm,10001+20000,987.5608 +gfx936,no_quant,torch.float16,192,256,3072,256,8,0,0,asm,10001+20000,1007.7293 +gfx936,no_quant,torch.float16,224,256,3072,256,8,0,0,asm,10007+20000,1009.22 +gfx936,no_quant,torch.float16,256,256,3072,256,8,0,0,asm,10007+20000,1027.6957 +gfx936,no_quant,torch.float16,320,256,3072,256,8,0,0,asm,10007+20000,1043.8052 +gfx936,no_quant,torch.float16,384,256,3072,256,8,0,0,asm,10007+20000,1075.1736 +gfx936,no_quant,torch.float16,448,256,3072,256,8,0,0,asm,10006+20000,1110.1966 +gfx936,no_quant,torch.float16,512,256,3072,256,8,0,0,asm,11006+21001,1120.0746 +gfx936,no_quant,torch.float16,640,256,3072,256,8,0,0,asm,12004+22001,1151.2998 +gfx936,no_quant,torch.float16,768,256,3072,256,8,0,0,asm,12004+22001,1180.8324 +gfx936,no_quant,torch.float16,896,256,3072,256,8,0,0,asm,12004+22001,1201.9524 +gfx936,no_quant,torch.float16,1024,256,3072,256,8,0,0,asm,12004+22001,1221.1692 +gfx936,no_quant,torch.float16,1280,256,3072,256,8,0,0,asm,12004+22001,1256.6892 +gfx936,no_quant,torch.float16,1536,256,3072,256,8,0,0,asm,12004+22001,1313.9523 +gfx936,no_quant,torch.float16,2048,256,3072,256,8,0,0,asm,13000+23001,1467.2068 +gfx936,no_quant,torch.float16,2304,256,3072,256,8,0,0,asm,13000+23001,1512.9077 +gfx936,no_quant,torch.float16,2560,256,3072,256,8,0,0,asm,13000+23001,1530.7098 +gfx936,no_quant,torch.float16,3072,256,3072,256,8,0,0,asm,13000+23001,1615.5686 +gfx936,no_quant,torch.float16,3584,256,3072,256,8,0,0,asm,13000+23001,1724.7306 +gfx936,no_quant,torch.float16,4096,256,3072,256,8,0,0,asm,13001+23001,1918.8103 +gfx936,no_quant,torch.float16,5120,256,3072,256,8,0,0,asm,13001+23001,2272.5277 +gfx936,no_quant,torch.float16,6144,256,3072,256,8,0,0,asm,13001+23001,2343.0624 
+gfx936,no_quant,torch.float16,7168,256,3072,256,8,0,0,asm,13001+23001,2560.4684 +gfx936,no_quant,torch.float16,8192,256,3072,256,8,0,0,asm,13001+23001,2878.8006 +gfx936,no_quant,torch.float16,10240,256,3072,256,8,0,0,asm,13001+23001,3414.3448 +gfx936,no_quant,torch.float16,12288,256,3072,256,8,0,0,asm,13001+23001,3988.8697 +gfx936,no_quant,torch.float16,16384,256,3072,256,8,0,0,asm,13001+23001,5057.6343 +gfx936,no_quant,torch.float16,24576,256,3072,256,8,0,0,asm,13001+23001,7355.5588 +gfx936,no_quant,torch.float16,32768,256,3072,256,8,0,0,asm,13001+23001,9745.8034 +gfx936,no_quant,torch.float16,1,128,3072,256,8,0,0,asm,10005+20000,42.0834 +gfx936,no_quant,torch.float16,2,128,3072,256,8,0,0,asm,10006+20000,57.5023 +gfx936,no_quant,torch.float16,4,128,3072,256,8,0,0,asm,10003+20000,89.1148 +gfx936,no_quant,torch.float16,6,128,3072,256,8,0,0,asm,10004+20000,115.9274 +gfx936,no_quant,torch.float16,8,128,3072,256,8,0,0,asm,10004+20000,140.0622 +gfx936,no_quant,torch.float16,12,128,3072,256,8,0,0,asm,10004+20000,183.4137 +gfx936,no_quant,torch.float16,16,128,3072,256,8,0,0,asm,10001+20000,217.6199 +gfx936,no_quant,torch.float16,24,128,3072,256,8,0,0,asm,10001+20000,280.0841 +gfx936,no_quant,torch.float16,32,128,3072,256,8,0,0,asm,10001+20000,347.9859 +gfx936,no_quant,torch.float16,36,128,3072,256,8,0,0,asm,10001+20002,354.8798 +gfx936,no_quant,torch.float16,48,128,3072,256,8,0,0,asm,10004+20000,397.8103 +gfx936,no_quant,torch.float16,56,128,3072,256,8,0,0,asm,10001+20000,417.8355 +gfx936,no_quant,torch.float16,64,128,3072,256,8,0,0,asm,10001+20000,431.9071 +gfx936,no_quant,torch.float16,72,128,3072,256,8,0,0,asm,10001+20000,449.6586 +gfx936,no_quant,torch.float16,80,128,3072,256,8,0,0,asm,10001+20000,466.3575 +gfx936,no_quant,torch.float16,88,128,3072,256,8,0,0,asm,10001+20000,476.4375 +gfx936,no_quant,torch.float16,96,128,3072,256,8,0,0,asm,10001+20000,484.1006 +gfx936,no_quant,torch.float16,100,128,3072,256,8,0,0,asm,10001+20000,483.6543 
+gfx936,no_quant,torch.float16,112,128,3072,256,8,0,0,asm,10001+20000,496.6479 +gfx936,no_quant,torch.float16,128,128,3072,256,8,0,0,asm,10001+20000,506.4332 +gfx936,no_quant,torch.float16,144,128,3072,256,8,0,0,asm,10001+20000,514.7279 +gfx936,no_quant,torch.float16,160,128,3072,256,8,0,0,asm,10001+20000,518.9469 +gfx936,no_quant,torch.float16,192,128,3072,256,8,0,0,asm,10001+20002,528.2184 +gfx936,no_quant,torch.float16,224,128,3072,256,8,0,0,asm,10001+20000,536.9594 +gfx936,no_quant,torch.float16,256,128,3072,256,8,0,0,asm,10001+20002,542.1299 +gfx936,no_quant,torch.float16,320,128,3072,256,8,0,0,asm,10001+20002,560.412 +gfx936,no_quant,torch.float16,384,128,3072,256,8,0,0,asm,10007+20002,578.4752 +gfx936,no_quant,torch.float16,448,128,3072,256,8,0,0,asm,10006+20002,595.0982 +gfx936,no_quant,torch.float16,512,128,3072,256,8,0,0,asm,11006+21001,614.9719 +gfx936,no_quant,torch.float16,640,128,3072,256,8,0,0,asm,11006+21001,643.1908 +gfx936,no_quant,torch.float16,768,128,3072,256,8,0,0,asm,11006+21001,656.4455 +gfx936,no_quant,torch.float16,896,128,3072,256,8,0,0,asm,11006+21001,669.5487 +gfx936,no_quant,torch.float16,1024,128,3072,256,8,0,0,asm,11006+21001,709.4055 +gfx936,no_quant,torch.float16,1280,128,3072,256,8,0,0,asm,12004+22001,751.9485 +gfx936,no_quant,torch.float16,1536,128,3072,256,8,0,0,asm,12004+22001,792.2685 +gfx936,no_quant,torch.float16,2048,128,3072,256,8,0,0,asm,12004+22001,896.9672 +gfx936,no_quant,torch.float16,2304,128,3072,256,8,0,0,asm,13000+23001,963.6365 +gfx936,no_quant,torch.float16,2560,128,3072,256,8,0,0,asm,13000+23001,1003.0975 +gfx936,no_quant,torch.float16,3072,128,3072,256,8,0,0,asm,13000+23001,1068.8068 +gfx936,no_quant,torch.float16,3584,128,3072,256,8,0,0,asm,13000+23001,1157.8004 +gfx936,no_quant,torch.float16,4096,128,3072,256,8,0,0,asm,12001+22001,1282.457 +gfx936,no_quant,torch.float16,5120,128,3072,256,8,0,0,asm,12001+22001,1514.4903 +gfx936,no_quant,torch.float16,6144,128,3072,256,8,0,0,asm,13001+23001,1667.9553 
+gfx936,no_quant,torch.float16,7168,128,3072,256,8,0,0,asm,13001+23001,1819.8287 +gfx936,no_quant,torch.float16,8192,128,3072,256,8,0,0,asm,13001+23001,2046.742 +gfx936,no_quant,torch.float16,10240,128,3072,256,8,0,0,asm,13001+23001,2443.8613 +gfx936,no_quant,torch.float16,12288,128,3072,256,8,0,0,asm,13001+23001,2844.9048 +gfx936,no_quant,torch.float16,16384,128,3072,256,8,0,0,asm,13001+23001,3597.2571 +gfx936,no_quant,torch.float16,24576,128,3072,256,8,0,0,asm,13001+23001,5205.65 +gfx936,no_quant,torch.float16,32768,128,3072,256,8,0,0,asm,13001+23001,6847.9883 diff --git a/aiter/configs/tuned_fmoe_asm_w4a16.csv b/aiter/configs/tuned_fmoe_asm_w4a16.csv new file mode 100644 index 0000000000000000000000000000000000000000..360987dcdabc72b5ff4956de245a03926003141d --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm_w4a16.csv @@ -0,0 +1,196 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx936,int4_w4a16,torch.float16,1,256,7168,256,8,0,0,asm,10000+20000,95.17214285714428 +gfx936,int4_w4a16,torch.float16,2,256,7168,256,8,0,0,asm,10000+20000,109.64071428571371 +gfx936,int4_w4a16,torch.float16,4,256,7168,256,8,0,0,asm,10000+20001,162.646428571428 +gfx936,int4_w4a16,torch.float16,6,256,7168,256,8,0,0,asm,10000+20000,224.8634285714288 +gfx936,int4_w4a16,torch.float16,8,256,7168,256,8,0,0,asm,10000+20000,245.8005714285721 +gfx936,int4_w4a16,torch.float16,10,256,7168,256,8,0,0,asm,10000+20001,298.3948571428573 +gfx936,int4_w4a16,torch.float16,12,256,7168,256,8,0,0,asm,10000+20001,304.8405714285706 +gfx936,int4_w4a16,torch.float16,14,256,7168,256,8,0,0,asm,10000+20001,386.76042857142886 +gfx936,int4_w4a16,torch.float16,16,256,7168,256,8,0,0,asm,10000+20001,421.7547142857137 +gfx936,int4_w4a16,torch.float16,20,256,7168,256,8,0,0,asm,10000+20000,517.134571428571 +gfx936,int4_w4a16,torch.float16,24,256,7168,256,8,0,0,asm,10000+20000,530.8487142857141 
+gfx936,int4_w4a16,torch.float16,28,256,7168,256,8,0,0,asm,10000+20001,571.7202857142856 +gfx936,int4_w4a16,torch.float16,32,256,7168,256,8,0,0,asm,10000+20001,568.383142857143 +gfx936,int4_w4a16,torch.float16,36,256,7168,256,8,0,0,asm,10000+20001,652.1515714285714 +gfx936,int4_w4a16,torch.float16,40,256,7168,256,8,0,0,asm,10000+20001,700.4257142857138 +gfx936,int4_w4a16,torch.float16,44,256,7168,256,8,0,0,asm,10000+20001,705.5685714285721 +gfx936,int4_w4a16,torch.float16,48,256,7168,256,8,0,0,asm,10000+20001,696.4028571428581 +gfx936,int4_w4a16,torch.float16,56,256,7168,256,8,0,0,asm,10000+20001,794.9171428571426 +gfx936,int4_w4a16,torch.float16,64,256,7168,256,8,0,0,asm,10000+20001,814.9171428571414 +gfx936,int4_w4a16,torch.float16,80,256,7168,256,8,0,0,asm,10000+20001,829.8885714285714 +gfx936,int4_w4a16,torch.float16,96,256,7168,256,8,0,0,asm,10000+20001,911.3284285714284 +gfx936,int4_w4a16,torch.float16,112,256,7168,256,8,0,0,asm,10000+20001,918.1169999999996 +gfx936,int4_w4a16,torch.float16,128,256,7168,256,8,0,0,asm,10000+20001,926.8941428571424 +gfx936,int4_w4a16,torch.float16,160,256,7168,256,8,0,0,asm,10000+20001,941.3170000000003 +gfx936,int4_w4a16,torch.float16,192,256,7168,256,8,0,0,asm,10000+20001,958.139857142858 +gfx936,int4_w4a16,torch.float16,224,256,7168,256,8,0,0,asm,10000+20001,983.6941428571432 +gfx936,int4_w4a16,torch.float16,256,256,7168,256,8,0,0,asm,10000+20001,988.4255714285724 +gfx936,int4_w4a16,torch.float16,320,256,7168,256,8,0,0,asm,10000+20001,1010.7332857142854 +gfx936,int4_w4a16,torch.float16,384,256,7168,256,8,0,0,asm,10000+20001,1094.2761428571423 +gfx936,int4_w4a16,torch.float16,448,256,7168,256,8,0,0,asm,10000+20001,1153.590142857143 +gfx936,int4_w4a16,torch.float16,512,256,7168,256,8,0,0,asm,10000+20001,1306.458571428572 +gfx936,int4_w4a16,torch.float16,576,256,7168,256,8,0,0,asm,11001+21001,1410.2527142857143 +gfx936,int4_w4a16,torch.float16,640,256,7168,256,8,0,0,asm,11001+21001,1434.2071428571428 
+gfx936,int4_w4a16,torch.float16,704,256,7168,256,8,0,0,asm,11001+21001,1496.423999999999 +gfx936,int4_w4a16,torch.float16,768,256,7168,256,8,0,0,asm,11001+21001,1520.1268571428573 +gfx936,int4_w4a16,torch.float16,832,256,7168,256,8,0,0,asm,11001+21001,1607.280999999999 +gfx936,int4_w4a16,torch.float16,896,256,7168,256,8,0,0,asm,11001+21001,1697.932285714286 +gfx936,int4_w4a16,torch.float16,960,256,7168,256,8,0,0,asm,11001+21001,1793.5892857142867 +gfx936,int4_w4a16,torch.float16,1024,256,7168,256,8,0,0,asm,11001+21001,1949.2920000000008 +gfx936,int4_w4a16,torch.float16,1152,256,7168,256,8,0,0,asm,11001+21001,2259.9428571428566 +gfx936,int4_w4a16,torch.float16,1280,256,7168,256,8,0,0,asm,11001+21001,2472.994 +gfx936,int4_w4a16,torch.float16,1408,256,7168,256,8,0,0,asm,11001+21001,2536.4909999999995 +gfx936,int4_w4a16,torch.float16,1536,256,7168,256,8,0,0,asm,11001+21001,2635.736571428572 +gfx936,int4_w4a16,torch.float16,1664,256,7168,256,8,0,0,asm,11001+21001,2727.5991428571456 +gfx936,int4_w4a16,torch.float16,1792,256,7168,256,8,0,0,asm,11001+21001,2847.301857142858 +gfx936,int4_w4a16,torch.float16,1920,256,7168,256,8,0,0,asm,11001+21001,3030.9815714285733 +gfx936,int4_w4a16,torch.float16,2048,256,7168,256,8,0,0,asm,11001+21001,3253.38114285714 +gfx936,int4_w4a16,torch.float16,2304,256,7168,256,8,0,0,asm,11001+21001,3691.437714285715 +gfx936,int4_w4a16,torch.float16,2560,256,7168,256,8,0,0,asm,11001+21001,3961.105857142853 +gfx936,int4_w4a16,torch.float16,2816,256,7168,256,8,0,0,asm,11001+21001,4214.888285714294 +gfx936,int4_w4a16,torch.float16,3072,256,7168,256,8,0,0,asm,11001+21001,4598.590571428569 +gfx936,int4_w4a16,torch.float16,3328,256,7168,256,8,0,0,asm,11001+21001,4923.961571428568 +gfx936,int4_w4a16,torch.float16,3584,256,7168,256,8,0,0,asm,11001+21001,5283.663857142858 +gfx936,int4_w4a16,torch.float16,3840,256,7168,256,8,0,0,asm,11001+21001,5579.549000000005 +gfx936,int4_w4a16,torch.float16,4096,256,7168,256,8,0,0,asm,11001+21001,5959.091142857148 
+gfx936,int4_w4a16,torch.float16,4608,256,7168,256,8,0,0,asm,11001+21001,6573.558857142861 +gfx936,int4_w4a16,torch.float16,5120,256,7168,256,8,0,0,asm,11001+21001,7240.826571428568 +gfx936,int4_w4a16,torch.float16,5632,256,7168,256,8,0,0,asm,11001+21001,7880.962714285714 +gfx936,int4_w4a16,torch.float16,6144,256,7168,256,8,0,0,asm,11001+21001,8558.858571428571 +gfx936,int4_w4a16,torch.float16,6656,256,7168,256,8,0,0,asm,11001+21001,9222.514857142862 +gfx936,int4_w4a16,torch.float16,7168,256,7168,256,8,0,0,asm,11001+21001,9888.730857142857 +gfx936,int4_w4a16,torch.float16,7680,256,7168,256,8,0,0,asm,11001+21001,10493.644714285716 +gfx936,int4_w4a16,torch.float16,8192,256,7168,256,8,0,0,asm,11001+21001,11209.300285714282 +gfx936,int4_w4a16,torch.float16,10240,256,7168,256,8,0,0,asm,11001+21001,13796.542142857152 +gfx936,int4_w4a16,torch.float16,12288,256,7168,256,8,0,0,asm,11001+21001,16473.77242857143 +gfx936,int4_w4a16,torch.float16,14336,256,7168,256,8,0,0,asm,11001+21001,19168.557142857146 +gfx936,int4_w4a16,torch.float16,16384,256,7168,256,8,0,0,asm,11001+21001,21783.615857142864 +gfx936,int4_w4a16,torch.float16,17408,256,7168,256,8,0,0,asm,11001+21001,23135.63657142856 +gfx936,int4_w4a16,torch.float16,24576,256,7168,256,8,0,0,asm,11001+21001,32335.211 +gfx936,int4_w4a16,torch.float16,32768,256,7168,256,8,0,0,asm,11001+21001,42933.38957142859 +gfx938,int4_w4a16,torch.float16,1,256,7168,256,8,0,0,asm,10000+20000,105.43471428539071 +gfx938,int4_w4a16,torch.float16,2,256,7168,256,8,0,0,asm,10000+20000,127.08042857237159 +gfx938,int4_w4a16,torch.float16,4,256,7168,256,8,0,0,asm,10000+20001,192.08599999959446 +gfx938,int4_w4a16,torch.float16,6,256,7168,256,8,0,0,asm,10000+20000,284.08557142796263 +gfx938,int4_w4a16,torch.float16,8,256,7168,256,8,0,0,asm,10000+20002,344.4512857141838 +gfx938,int4_w4a16,torch.float16,10,256,7168,256,8,0,0,asm,10000+20000,440.88528571384296 +gfx938,int4_w4a16,torch.float16,12,256,7168,256,8,0,0,asm,10000+20001,501.20514285670856 
+gfx938,int4_w4a16,torch.float16,14,256,7168,256,8,0,0,asm,10000+20000,596.9989999990378 +gfx938,int4_w4a16,torch.float16,16,256,7168,256,8,0,0,asm,10000+20002,605.4334285715928 +gfx938,int4_w4a16,torch.float16,20,256,7168,256,8,0,0,asm,10000+20001,750.7557142856531 +gfx938,int4_w4a16,torch.float16,24,256,7168,256,8,0,0,asm,10000+20001,756.40142857137 +gfx938,int4_w4a16,torch.float16,28,256,7168,256,8,0,0,asm,10000+20001,825.547142857087 +gfx938,int4_w4a16,torch.float16,32,256,7168,256,8,0,0,asm,10000+20001,836.5414285711678 +gfx938,int4_w4a16,torch.float16,36,256,7168,256,8,0,0,asm,10000+20002,911.303857142904 +gfx938,int4_w4a16,torch.float16,40,256,7168,256,8,0,0,asm,10000+20002,967.166571428567 +gfx938,int4_w4a16,torch.float16,44,256,7168,256,8,0,0,asm,10000+20001,1061.2692857140541 +gfx938,int4_w4a16,torch.float16,48,256,7168,256,8,0,0,asm,10000+20001,1066.9377142857495 +gfx938,int4_w4a16,torch.float16,56,256,7168,256,8,0,0,asm,10000+20001,1116.629000000057 +gfx938,int4_w4a16,torch.float16,64,256,7168,256,8,0,0,asm,10000+20002,1193.4288571427876 +gfx938,int4_w4a16,torch.float16,80,256,7168,256,8,0,0,asm,10000+20002,1217.9315714285476 +gfx938,int4_w4a16,torch.float16,96,256,7168,256,8,0,0,asm,10000+20002,1250.1371428567384 +gfx938,int4_w4a16,torch.float16,112,256,7168,256,8,0,0,asm,10000+20002,1273.9772857142877 +gfx938,int4_w4a16,torch.float16,128,256,7168,256,8,0,0,asm,10000+20002,1276.6285714285416 +gfx938,int4_w4a16,torch.float16,160,256,7168,256,8,0,0,asm,10000+20002,1278.320142857198 +gfx938,int4_w4a16,torch.float16,192,256,7168,256,8,0,0,asm,10000+20002,1289.4514285716493 +gfx938,int4_w4a16,torch.float16,224,256,7168,256,8,0,0,asm,10000+20002,1295.0284285714984 +gfx938,int4_w4a16,torch.float16,256,256,7168,256,8,0,0,asm,10000+20001,1408.8795714286555 +gfx938,int4_w4a16,torch.float16,320,256,7168,256,8,0,0,asm,10000+20001,1426.959571428597 +gfx938,int4_w4a16,torch.float16,384,256,7168,256,8,0,0,asm,10000+20001,1486.8908571428951 
+gfx938,int4_w4a16,torch.float16,448,256,7168,256,8,0,0,asm,10000+20002,1611.6447142859522 +gfx938,int4_w4a16,torch.float16,512,256,7168,256,8,0,0,asm,10000+20002,1923.9182857143958 +gfx938,int4_w4a16,torch.float16,576,256,7168,256,8,0,0,asm,10000+20002,2170.1348571429094 +gfx938,int4_w4a16,torch.float16,640,256,7168,256,8,0,0,asm,10000+20001,2377.3797142855556 +gfx938,int4_w4a16,torch.float16,704,256,7168,256,8,0,0,asm,10000+20002,2500.55657142893 +gfx938,int4_w4a16,torch.float16,768,256,7168,256,8,0,0,asm,10000+20002,2559.2078571429342 +gfx938,int4_w4a16,torch.float16,832,256,7168,256,8,0,0,asm,10000+20001,2760.692999999305 +gfx938,int4_w4a16,torch.float16,896,256,7168,256,8,0,0,asm,10000+20002,2826.293000000263 +gfx938,int4_w4a16,torch.float16,960,256,7168,256,8,0,0,asm,10000+20002,3070.77228571433 +gfx938,int4_w4a16,torch.float16,1024,256,7168,256,8,0,0,asm,10000+20002,3187.7548571428715 +gfx938,int4_w4a16,torch.float16,1152,256,7168,256,8,0,0,asm,10000+20002,3501.8795714285225 +gfx938,int4_w4a16,torch.float16,1280,256,7168,256,8,0,0,asm,10000+20002,3815.9817142858437 +gfx938,int4_w4a16,torch.float16,1408,256,7168,256,8,0,0,asm,10000+20002,4135.843714285708 +gfx938,int4_w4a16,torch.float16,1536,256,7168,256,8,0,0,asm,10000+20002,4453.054285714236 +gfx938,int4_w4a16,torch.float16,1664,256,7168,256,8,0,0,asm,10000+20002,4908.824428571521 +gfx938,int4_w4a16,torch.float16,1792,256,7168,256,8,0,0,asm,10000+20002,5088.504000000057 +gfx938,int4_w4a16,torch.float16,1920,256,7168,256,8,0,0,asm,10000+20002,5400.8005714285455 +gfx938,int4_w4a16,torch.float16,2048,256,7168,256,8,0,0,asm,10000+20002,5717.8740000000025 +gfx938,int4_w4a16,torch.float16,2304,256,7168,256,8,0,0,asm,10000+20002,6355.015000000048 +gfx938,int4_w4a16,torch.float16,2560,256,7168,256,8,0,0,asm,10000+20002,7130.669857142909 +gfx938,int4_w4a16,torch.float16,2816,256,7168,256,8,0,0,asm,10000+20002,7623.331428571405 
+gfx938,int4_w4a16,torch.float16,3072,256,7168,256,8,0,0,asm,10000+20002,8248.769285714088 +gfx938,int4_w4a16,torch.float16,3328,256,7168,256,8,0,0,asm,10000+20002,8877.841571428573 +gfx938,int4_w4a16,torch.float16,3584,256,7168,256,8,0,0,asm,10000+20002,9516.399999999574 +gfx938,int4_w4a16,torch.float16,3840,256,7168,256,8,0,0,asm,10000+20002,10147.803285714293 +gfx938,int4_w4a16,torch.float16,4096,256,7168,256,8,0,0,asm,10000+20002,10784.464142857187 +gfx938,int4_w4a16,torch.float16,4608,256,7168,256,8,0,0,asm,10000+20002,12002.792285714431 +gfx938,int4_w4a16,torch.float16,5120,256,7168,256,8,0,0,asm,10000+20002,13309.120142857179 +gfx938,int4_w4a16,torch.float16,5632,256,7168,256,8,0,0,asm,10000+20002,14576.659714285517 +gfx938,int4_w4a16,torch.float16,6144,256,7168,256,8,0,0,asm,10000+20002,15842.21071428606 +gfx938,int4_w4a16,torch.float16,6656,256,7168,256,8,0,0,asm,10000+20002,17109.932714285194 +gfx938,int4_w4a16,torch.float16,7168,256,7168,256,8,0,0,asm,10000+20002,18361.67728571487 +gfx938,int4_w4a16,torch.float16,7680,256,7168,256,8,0,0,asm,10000+20002,19628.737285714065 +gfx938,int4_w4a16,torch.float16,8192,256,7168,256,8,0,0,asm,10000+20002,20897.122571428772 +gfx938,int4_w4a16,torch.float16,10240,256,7168,256,8,0,0,asm,10000+20002,25957.222142856703 +gfx938,int4_w4a16,torch.float16,12288,256,7168,256,8,0,0,asm,10000+20002,31018.37357142857 +gfx938,int4_w4a16,torch.float16,14336,256,7168,256,8,0,0,asm,10000+20002,36075.15457142866 +gfx938,int4_w4a16,torch.float16,16384,256,7168,256,8,0,0,asm,10000+20002,41135.762714286466 +gfx938,int4_w4a16,torch.float16,17408,256,7168,256,8,0,0,asm,10000+20002,43667.20685714267 +gfx938,int4_w4a16,torch.float16,24576,256,7168,256,8,0,0,asm,10000+20002,61364.23214285701 +gfx938,int4_w4a16,torch.float16,32768,256,7168,256,8,0,0,asm,10000+20002,81594.00357142859 +gfx938,int4_w4a16,torch.float16,1,256,7168,384,8,0,0,asm,10000+20000,108.1967 +gfx938,int4_w4a16,torch.float16,2,256,7168,384,8,0,0,asm,10000+20000,126.5040 
+gfx938,int4_w4a16,torch.float16,4,256,7168,384,8,0,0,asm,10000+20001,196.5924 +gfx938,int4_w4a16,torch.float16,6,256,7168,384,8,0,0,asm,10000+20000,290.6636 +gfx938,int4_w4a16,torch.float16,8,256,7168,384,8,0,0,asm,10000+20002,348.1119 +gfx938,int4_w4a16,torch.float16,10,256,7168,384,8,0,0,asm,10000+20000,445.9054 +gfx938,int4_w4a16,torch.float16,12,256,7168,384,8,0,0,asm,10000+20001,511.1852 +gfx938,int4_w4a16,torch.float16,14,256,7168,384,8,0,0,asm,10000+20001,519.2441 +gfx938,int4_w4a16,torch.float16,16,256,7168,384,8,0,0,asm,10000+20002,616.4397 +gfx938,int4_w4a16,torch.float16,20,256,7168,384,8,0,0,asm,10000+20001,753.6237 +gfx938,int4_w4a16,torch.float16,24,256,7168,384,8,0,0,asm,10000+20001,817.4551 +gfx938,int4_w4a16,torch.float16,28,256,7168,384,8,0,0,asm,10000+20002,912.1243 +gfx938,int4_w4a16,torch.float16,32,256,7168,384,8,0,0,asm,10000+20002,963.9222 +gfx938,int4_w4a16,torch.float16,36,256,7168,384,8,0,0,asm,10000+20001,1108.6964 +gfx938,int4_w4a16,torch.float16,40,256,7168,384,8,0,0,asm,10000+20002,1200.7804 +gfx938,int4_w4a16,torch.float16,44,256,7168,384,8,0,0,asm,10000+20002,1252.2244 +gfx938,int4_w4a16,torch.float16,48,256,7168,384,8,0,0,asm,10000+20002,1264.3423 +gfx938,int4_w4a16,torch.float16,56,256,7168,384,8,0,0,asm,10000+20001,1378.8346 +gfx938,int4_w4a16,torch.float16,64,256,7168,384,8,0,0,asm,10000+20002,1498.6154 +gfx938,int4_w4a16,torch.float16,80,256,7168,384,8,0,0,asm,10000+20002,1581.3351 +gfx938,int4_w4a16,torch.float16,96,256,7168,384,8,0,0,asm,10000+20001,1686.9685 +gfx938,int4_w4a16,torch.float16,112,256,7168,384,8,0,0,asm,10000+20001,1739.7936 +gfx938,int4_w4a16,torch.float16,128,256,7168,384,8,0,0,asm,10000+20002,1813.1493 +gfx938,int4_w4a16,torch.float16,160,256,7168,384,8,0,0,asm,10000+20002,1869.6711 +gfx938,int4_w4a16,torch.float16,192,256,7168,384,8,0,0,asm,10000+20002,1879.8691 +gfx938,int4_w4a16,torch.float16,224,256,7168,384,8,0,0,asm,10000+20002,1898.5048 
+gfx938,int4_w4a16,torch.float16,256,256,7168,384,8,0,0,asm,10000+20002,1908.3069 +gfx938,int4_w4a16,torch.float16,320,256,7168,384,8,0,0,asm,10000+20002,1914.6057 +gfx938,int4_w4a16,torch.float16,384,256,7168,384,8,0,0,asm,10000+20001,2022.0243 +gfx938,int4_w4a16,torch.float16,448,256,7168,384,8,0,0,asm,10000+20001,2031.7760 +gfx938,int4_w4a16,torch.float16,512,256,7168,384,8,0,0,asm,10000+20001,2056.854 +gfx938,int4_w4a16,torch.float16,576,256,7168,384,8,0,0,asm,10000+20002,2222.1249 +gfx938,int4_w4a16,torch.float16,640,256,7168,384,8,0,0,asm,10000+20001,2354.6466 +gfx938,int4_w4a16,torch.float16,768,256,7168,384,8,0,0,asm,10000+20002,2802.9235 +gfx938,int4_w4a16,torch.float16,896,256,7168,384,8,0,0,asm,10000+20002,3359.1829 +gfx938,int4_w4a16,torch.float16,1024,256,7168,384,8,0,0,asm,10000+20002,3622.8789 +gfx938,int4_w4a16,torch.float16,1280,256,7168,384,8,0,0,asm,10000+20002,4052.1077 +gfx938,int4_w4a16,torch.float16,1536,256,7168,384,8,0,0,asm,10000+20002,4672.0215 +gfx938,int4_w4a16,torch.float16,2048,256,7168,384,8,0,0,asm,10000+20002,5924.0686 +gfx938,int4_w4a16,torch.float16,2560,256,7168,384,8,0,0,asm,10000+20002,7228.8482 +gfx938,int4_w4a16,torch.float16,3072,256,7168,384,8,0,0,asm,10000+20002,8496.5672 +gfx938,int4_w4a16,torch.float16,3584,256,7168,384,8,0,0,asm,10000+20002,9699.309 +gfx938,int4_w4a16,torch.float16,3840,256,7168,384,8,0,0,asm,10000+20002,10366.0861 +gfx938,int4_w4a16,torch.float16,4096,256,7168,384,8,0,0,asm,10000+20002,10946.9519 +gfx938,int4_w4a16,torch.float16,4608,256,7168,384,8,0,0,asm,10000+20002,12245.0451 +gfx938,int4_w4a16,torch.float16,5120,256,7168,384,8,0,0,asm,10000+20002,13491.7789 +gfx938,int4_w4a16,torch.float16,6144,256,7168,384,8,0,0,asm,10000+20002,15987.2496 +gfx938,int4_w4a16,torch.float16,7168,256,7168,384,8,0,0,asm,10000+20002,18455.4978 +gfx938,int4_w4a16,torch.float16,7680,256,7168,384,8,0,0,asm,10000+20002,19750.3316 +gfx938,int4_w4a16,torch.float16,8192,256,7168,384,8,0,0,asm,10000+20002,21000.2742 
+gfx938,int4_w4a16,torch.float16,10240,256,7168,384,8,0,0,asm,10000+20002,26001.247 +gfx938,int4_w4a16,torch.float16,12288,256,7168,384,8,0,0,asm,10000+20002,30978.9432 +gfx938,int4_w4a16,torch.float16,16384,256,7168,384,8,0,0,asm,10000+20002,40987.1437 +gfx938,int4_w4a16,torch.float16,24576,256,7168,384,8,0,0,asm,10000+20002,60960.2526 +gfx938,int4_w4a16,torch.float16,32768,256,7168,384,8,0,0,asm,10000+20002,80978.0681 diff --git a/aiter/configs/tuned_fmoe_asm_w4a8_group.csv b/aiter/configs/tuned_fmoe_asm_w4a8_group.csv new file mode 100644 index 0000000000000000000000000000000000000000..980ba7cfddad4d78491ae2d950bd7f24aceb8e8a --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm_w4a8_group.csv @@ -0,0 +1,95 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx936,int4_w4a8,torch.float16,1,256,7168,256,8,0,0,asm,10001+20100,121.2912857142849 +gfx936,int4_w4a8,torch.float16,2,256,7168,256,8,0,0,asm,10001+20000,137.86271428571425 +gfx936,int4_w4a8,torch.float16,4,256,7168,256,8,0,0,asm,10002+20101,181.22271428571534 +gfx936,int4_w4a8,torch.float16,6,256,7168,256,8,0,0,asm,10001+20100,243.78257142857066 +gfx936,int4_w4a8,torch.float16,8,256,7168,256,8,0,0,asm,10001+20101,270.7997142857147 +gfx936,int4_w4a8,torch.float16,10,256,7168,256,8,0,0,asm,10001+20101,305.10828571428607 +gfx936,int4_w4a8,torch.float16,12,256,7168,256,8,0,0,asm,10002+20101,376.28528571428666 +gfx936,int4_w4a8,torch.float16,14,256,7168,256,8,0,0,asm,10001+20101,385.5881428571429 +gfx936,int4_w4a8,torch.float16,16,256,7168,256,8,0,0,asm,10002+20101,415.09671428571426 +gfx936,int4_w4a8,torch.float16,20,256,7168,256,8,0,0,asm,10001+20101,496.4679999999984 +gfx936,int4_w4a8,torch.float16,24,256,7168,256,8,0,0,asm,10001+20101,504.7651428571422 +gfx936,int4_w4a8,torch.float16,28,256,7168,256,8,0,0,asm,10001+20101,546.5937142857139 +gfx936,int4_w4a8,torch.float16,32,256,7168,256,8,0,0,asm,10001+20101,552.2165714285693 
+gfx936,int4_w4a8,torch.float16,36,256,7168,256,8,0,0,asm,10001+20101,629.9535714285719 +gfx936,int4_w4a8,torch.float16,40,256,7168,256,8,0,0,asm,10002+20101,665.0164285714278 +gfx936,int4_w4a8,torch.float16,44,256,7168,256,8,0,0,asm,10001+20101,673.3135714285709 +gfx936,int4_w4a8,torch.float16,48,256,7168,256,8,0,0,asm,10001+20101,732.5589999999995 +gfx936,int4_w4a8,torch.float16,56,256,7168,256,8,0,0,asm,10001+20101,769.8848571428576 +gfx936,int4_w4a8,torch.float16,64,256,7168,256,8,0,0,asm,10001+20101,783.5077142857135 +gfx936,int4_w4a8,torch.float16,72,256,7168,256,8,0,0,asm,10002+20101,816.6962857142868 +gfx936,int4_w4a8,torch.float16,80,256,7168,256,8,0,0,asm,10001+20101,812.2162857142861 +gfx936,int4_w4a8,torch.float16,96,256,7168,256,8,0,0,asm,10001+20101,834.7534285714281 +gfx936,int4_w4a8,torch.float16,112,256,7168,256,8,0,0,asm,10001+20101,888.2618571428568 +gfx936,int4_w4a8,torch.float16,128,256,7168,256,8,0,0,asm,10001+20101,894.7989999999998 +gfx936,int4_w4a8,torch.float16,160,256,7168,256,8,0,0,asm,10002+20101,916.1018571428574 +gfx936,int4_w4a8,torch.float16,192,256,7168,256,8,0,0,asm,10002+20101,937.0160000000004 +gfx936,int4_w4a8,torch.float16,224,256,7168,256,8,0,0,asm,10002+20101,980.3990000000005 +gfx936,int4_w4a8,torch.float16,256,256,7168,256,8,0,0,asm,10002+20101,989.221714285714 +gfx936,int4_w4a8,torch.float16,320,256,7168,256,8,0,0,asm,10002+20101,1032.2160000000001 +gfx936,int4_w4a8,torch.float16,384,256,7168,256,8,0,0,asm,10002+20101,1075.0731428571432 +gfx936,int4_w4a8,torch.float16,448,256,7168,256,8,0,0,asm,10002+20101,1112.9245714285714 +gfx936,int4_w4a8,torch.float16,512,256,7168,256,8,0,0,asm,11001+21101,1172.170142857143 +gfx936,int4_w4a8,torch.float16,576,256,7168,256,8,0,0,asm,11001+21101,1200.741428571429 +gfx936,int4_w4a8,torch.float16,640,256,7168,256,8,0,0,asm,11001+21101,1236.4672857142875 +gfx936,int4_w4a8,torch.float16,704,256,7168,256,8,0,0,asm,11001+21101,1269.9985714285708 
+gfx936,int4_w4a8,torch.float16,768,256,7168,256,8,0,0,asm,11001+21101,1321.54142857143 +gfx936,int4_w4a8,torch.float16,832,256,7168,256,8,0,0,asm,11001+21101,1423.6670000000001 +gfx936,int4_w4a8,torch.float16,896,256,7168,256,8,0,0,asm,11001+21101,1467.1412857142861 +gfx936,int4_w4a8,torch.float16,960,256,7168,256,8,0,0,asm,11001+21101,1529.1754285714298 +gfx936,int4_w4a8,torch.float16,1024,256,7168,256,8,0,0,asm,11001+21101,1651.9409999999991 +gfx936,int4_w4a8,torch.float16,1152,256,7168,256,8,0,0,asm,12000+22101,1841.1522857142859 +gfx936,int4_w4a8,torch.float16,1280,256,7168,256,8,0,0,asm,12000+22101,1928.740714285715 +gfx936,int4_w4a8,torch.float16,1408,256,7168,256,8,0,0,asm,12000+22101,2031.506428571427 +gfx936,int4_w4a8,torch.float16,1536,256,7168,256,8,0,0,asm,12000+22100,2093.7921428571417 +gfx936,int4_w4a8,torch.float16,1664,256,7168,256,8,0,0,asm,12000+22100,2159.643428571429 +gfx936,int4_w4a8,torch.float16,1792,256,7168,256,8,0,0,asm,12000+22100,2340.2147142857125 +gfx936,int4_w4a8,torch.float16,1920,256,7168,256,8,0,0,asm,12000+22101,2536.511571428572 +gfx936,int4_w4a8,torch.float16,2048,256,7168,256,8,0,0,asm,12000+22101,2711.5742857142855 +gfx936,int4_w4a8,torch.float16,2304,256,7168,256,8,0,0,asm,12000+22101,3163.2538571428563 +gfx936,int4_w4a8,torch.float16,2560,256,7168,256,8,0,0,asm,11001+21101,3370.6592857142864 +gfx936,int4_w4a8,torch.float16,2816,256,7168,256,8,0,0,asm,12000+22101,3553.2875714285706 +gfx936,int4_w4a8,torch.float16,3072,256,7168,256,8,0,0,asm,12000+22101,3640.8532857142886 +gfx936,int4_w4a8,torch.float16,3328,256,7168,256,8,0,0,asm,12000+22101,3754.5674285714254 +gfx936,int4_w4a8,torch.float16,3584,256,7168,256,8,0,0,asm,12000+22101,4066.704285714291 +gfx936,int4_w4a8,torch.float16,3840,256,7168,256,8,0,0,asm,12000+22101,4285.606857142853 +gfx936,int4_w4a8,torch.float16,4096,256,7168,256,8,0,0,asm,12000+22101,4511.504 +gfx936,int4_w4a8,torch.float16,4608,256,7168,256,8,0,0,asm,12000+22101,5246.771714285715 
+gfx936,int4_w4a8,torch.float16,5120,256,7168,256,8,0,0,asm,12000+22101,5601.42285714286 +gfx936,int4_w4a8,torch.float16,5632,256,7168,256,8,0,0,asm,12000+22101,5841.239571428574 +gfx936,int4_w4a8,torch.float16,6144,256,7168,256,8,0,0,asm,12000+22101,6483.31914285714 +gfx936,int4_w4a8,torch.float16,6656,256,7168,256,8,0,0,asm,12000+22101,7094.564285714291 +gfx936,int4_w4a8,torch.float16,7168,256,7168,256,8,0,0,asm,12000+22101,7407.546857142855 +gfx936,int4_w4a8,torch.float16,7680,256,7168,256,8,0,0,asm,12000+22101,7759.112571428579 +gfx936,int4_w4a8,torch.float16,8192,256,7168,256,8,0,0,asm,12000+22101,8344.666142857135 +gfx936,int4_w4a8,torch.float16,10240,256,7168,256,8,0,0,asm,12000+22101,10278.127285714294 +gfx936,int4_w4a8,torch.float16,12288,256,7168,256,8,0,0,asm,12000+22101,12087.954142857148 +gfx936,int4_w4a8,torch.float16,14336,256,7168,256,8,0,0,asm,12000+22101,13912.912571428593 +gfx936,int4_w4a8,torch.float16,16384,256,7168,256,8,0,0,asm,12000+22101,15923.08257142857 +gfx936,int4_w4a8,torch.float16,17408,256,7168,256,8,0,0,asm,13000+23101,17620.064000000006 +gfx936,int4_w4a8,torch.float16,24576,256,7168,256,8,0,0,asm,13000+23101,22840.699571428548 +gfx936,int4_w4a8,torch.float16,32768,256,7168,256,8,0,0,asm,13000+23101,29839.300142857148 +gfx936,int4_w4a8,torch.float16,1,128,7168,256,8,0,0,asm,10001+20000,118.84557142857135 +gfx936,int4_w4a8,torch.float16,8,128,7168,256,8,0,0,asm,10002+20000,210.50271428571457 +gfx936,int4_w4a8,torch.float16,32,128,7168,256,8,0,0,asm,10002+20000,410.98257142857136 +gfx936,int4_w4a8,torch.float16,48,128,7168,256,8,0,0,asm,10001+20000,486.6395714285716 +gfx936,int4_w4a8,torch.float16,64,128,7168,256,8,0,0,asm,10001+20000,507.0281428571426 +gfx936,int4_w4a8,torch.float16,80,128,7168,256,8,0,0,asm,10001+20000,530.9595714285716 +gfx936,int4_w4a8,torch.float16,96,128,7168,256,8,0,0,asm,10002+20000,564.7651428571434 +gfx936,int4_w4a8,torch.float16,128,128,7168,256,8,0,0,asm,10002+20000,588.8794285714288 
+gfx936,int4_w4a8,torch.float16,256,128,7168,256,8,0,0,asm,10002+20000,651.4622857142858 +gfx936,int4_w4a8,torch.float16,512,128,7168,256,8,0,0,asm,11001+21000,816.7192857142861 +gfx936,int4_w4a8,torch.float16,768,128,7168,256,8,0,0,asm,11001+21000,905.7248571428562 +gfx936,int4_w4a8,torch.float16,1024,128,7168,256,8,0,0,asm,11001+21000,1194.478857142858 +gfx936,int4_w4a8,torch.float16,2048,128,7168,256,8,0,0,asm,11001+21000,2014.7065714285711 +gfx936,int4_w4a8,torch.float16,3072,128,7168,256,8,0,0,asm,13000+23000,2794.6144285714277 +gfx936,int4_w4a8,torch.float16,4096,128,7168,256,8,0,0,asm,12000+22000,3567.93942857143 +gfx936,int4_w4a8,torch.float16,5120,128,7168,256,8,0,0,asm,12000+22000,4365.652857142859 +gfx936,int4_w4a8,torch.float16,6144,128,7168,256,8,0,0,asm,12000+22000,5152.532285714286 +gfx936,int4_w4a8,torch.float16,8192,128,7168,256,8,0,0,asm,13000+23000,6526.58800000001 +gfx936,int4_w4a8,torch.float16,10240,128,7168,256,8,0,0,asm,13000+23000,8007.180714285722 +gfx936,int4_w4a8,torch.float16,12288,128,7168,256,8,0,0,asm,13000+23000,9478.585142857133 +gfx936,int4_w4a8,torch.float16,16384,128,7168,256,8,0,0,asm,13000+23000,12350.97114285713 +gfx936,int4_w4a8,torch.float16,24576,128,7168,256,8,0,0,asm,13000+23000,17921.823142857134 +gfx936,int4_w4a8,torch.float16,32768,128,7168,256,8,0,0,asm,13000+23000,23625.54328571425 diff --git a/aiter/configs/tuned_fmoe_asm_w8a8_channel.csv b/aiter/configs/tuned_fmoe_asm_w8a8_channel.csv new file mode 100644 index 0000000000000000000000000000000000000000..b98bb247e9b86cd31993fe7efac742bb965e6b0b --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm_w8a8_channel.csv @@ -0,0 +1,1044 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx936,int8_w8a8_channel,torch.float16,1,128,7168,256,8,0,0,asm,10000+20100,62.7804 +gfx936,int8_w8a8_channel,torch.float16,2,128,7168,256,8,0,0,asm,10006+20102,78.3931 
+gfx936,int8_w8a8_channel,torch.float16,3,128,7168,256,8,0,0,asm,10009+20102,97.3237 +gfx936,int8_w8a8_channel,torch.float16,4,128,7168,256,8,0,0,asm,10010+20102,117.012 +gfx936,int8_w8a8_channel,torch.float16,5,128,7168,256,8,0,0,asm,10011+20102,138.9489 +gfx936,int8_w8a8_channel,torch.float16,6,128,7168,256,8,0,0,asm,10001+20102,161.0289 +gfx936,int8_w8a8_channel,torch.float16,7,128,7168,256,8,0,0,asm,10013+20102,177.4583 +gfx936,int8_w8a8_channel,torch.float16,8,128,7168,256,8,0,0,asm,10001+20102,187.4036 +gfx936,int8_w8a8_channel,torch.float16,9,128,7168,256,8,0,0,asm,10000+20102,202.8142 +gfx936,int8_w8a8_channel,torch.float16,10,128,7168,256,8,0,0,asm,10001+20102,214.7384 +gfx936,int8_w8a8_channel,torch.float16,11,128,7168,256,8,0,0,asm,10000+20102,231.7405 +gfx936,int8_w8a8_channel,torch.float16,12,128,7168,256,8,0,0,asm,10001+20102,245.029 +gfx936,int8_w8a8_channel,torch.float16,13,128,7168,256,8,0,0,asm,10001+20102,264.4648 +gfx936,int8_w8a8_channel,torch.float16,14,128,7168,256,8,0,0,asm,10001+20102,276.8016 +gfx936,int8_w8a8_channel,torch.float16,15,128,7168,256,8,0,0,asm,10000+20102,287.3027 +gfx936,int8_w8a8_channel,torch.float16,16,128,7168,256,8,0,0,asm,10000+20102,291.2606 +gfx936,int8_w8a8_channel,torch.float16,17,128,7168,256,8,0,0,asm,10001+20102,294.9376 +gfx936,int8_w8a8_channel,torch.float16,18,128,7168,256,8,0,0,asm,10000+20102,310.9124 +gfx936,int8_w8a8_channel,torch.float16,20,128,7168,256,8,0,0,asm,10001+20001,336.7987 +gfx936,int8_w8a8_channel,torch.float16,24,128,7168,256,8,0,0,asm,10001+20001,378.6513 +gfx936,int8_w8a8_channel,torch.float16,28,128,7168,256,8,0,0,asm,10001+20001,432.1786 +gfx936,int8_w8a8_channel,torch.float16,32,128,7168,256,8,0,0,asm,10001+20001,459.5133 +gfx936,int8_w8a8_channel,torch.float16,34,128,7168,256,8,0,0,asm,10000+20001,458.9208 +gfx936,int8_w8a8_channel,torch.float16,36,128,7168,256,8,0,0,asm,10001+20001,476.6135 +gfx936,int8_w8a8_channel,torch.float16,40,128,7168,256,8,0,0,asm,10001+20001,492.2177 
+gfx936,int8_w8a8_channel,torch.float16,44,128,7168,256,8,0,0,asm,10001+20001,516.9756 +gfx936,int8_w8a8_channel,torch.float16,48,128,7168,256,8,0,0,asm,10001+20001,527.0556 +gfx936,int8_w8a8_channel,torch.float16,56,128,7168,256,8,0,0,asm,10001+20001,554.8452 +gfx936,int8_w8a8_channel,torch.float16,64,128,7168,256,8,0,0,asm,10001+20001,576.5378 +gfx936,int8_w8a8_channel,torch.float16,68,128,7168,256,8,0,0,asm,10000+20001,581.8515 +gfx936,int8_w8a8_channel,torch.float16,72,128,7168,256,8,0,0,asm,10000+20001,595.1904 +gfx936,int8_w8a8_channel,torch.float16,80,128,7168,256,8,0,0,asm,10001+20001,611.2831 +gfx936,int8_w8a8_channel,torch.float16,88,128,7168,256,8,0,0,asm,10001+20001,635.0894 +gfx936,int8_w8a8_channel,torch.float16,96,128,7168,256,8,0,0,asm,10001+20001,644.9505 +gfx936,int8_w8a8_channel,torch.float16,104,128,7168,256,8,0,0,asm,10000+20001,650.0705 +gfx936,int8_w8a8_channel,torch.float16,112,128,7168,256,8,0,0,asm,10001+20001,657.5484 +gfx936,int8_w8a8_channel,torch.float16,128,128,7168,256,8,0,0,asm,10001+20001,683.5779 +gfx936,int8_w8a8_channel,torch.float16,144,128,7168,256,8,0,0,asm,10001+20001,686.4916 +gfx936,int8_w8a8_channel,torch.float16,160,128,7168,256,8,0,0,asm,10001+20001,696.2768 +gfx936,int8_w8a8_channel,torch.float16,192,128,7168,256,8,0,0,asm,10001+20001,709.7168 +gfx936,int8_w8a8_channel,torch.float16,224,128,7168,256,8,0,0,asm,10001+20001,740.2179 +gfx936,int8_w8a8_channel,torch.float16,256,128,7168,256,8,0,0,asm,10001+20001,745.6664 +gfx936,int8_w8a8_channel,torch.float16,320,128,7168,256,8,0,0,asm,11001+21001,775.7295 +gfx936,int8_w8a8_channel,torch.float16,384,128,7168,256,8,0,0,asm,11001+21001,795.1484 +gfx936,int8_w8a8_channel,torch.float16,448,128,7168,256,8,0,0,asm,11001+21001,825.2537 +gfx936,int8_w8a8_channel,torch.float16,512,128,7168,256,8,0,0,asm,11001+21001,846.2138 +gfx936,int8_w8a8_channel,torch.float16,576,128,7168,256,8,0,0,asm,11001+21001,871.5864 
+gfx936,int8_w8a8_channel,torch.float16,640,128,7168,256,8,0,0,asm,11001+21001,892.3527 +gfx936,int8_w8a8_channel,torch.float16,704,128,7168,256,8,0,0,asm,11006+21001,916.4117 +gfx936,int8_w8a8_channel,torch.float16,768,128,7168,256,8,0,0,asm,11006+21001,938.736 +gfx936,int8_w8a8_channel,torch.float16,832,128,7168,256,8,0,0,asm,11007+21001,965.1192 +gfx936,int8_w8a8_channel,torch.float16,896,128,7168,256,8,0,0,asm,11007+21001,981.9782 +gfx936,int8_w8a8_channel,torch.float16,960,128,7168,256,8,0,0,asm,11006+21001,1012.9761 +gfx936,int8_w8a8_channel,torch.float16,1024,128,7168,256,8,0,0,asm,11007+21001,1019.0813 +gfx936,int8_w8a8_channel,torch.float16,1152,128,7168,256,8,0,0,asm,11005+21001,1077.9614 +gfx936,int8_w8a8_channel,torch.float16,1280,128,7168,256,8,0,0,asm,12004+22001,1130.8541 +gfx936,int8_w8a8_channel,torch.float16,1408,128,7168,256,8,0,0,asm,12004+22001,1175.7888 +gfx936,int8_w8a8_channel,torch.float16,1536,128,7168,256,8,0,0,asm,12004+22001,1212.8668 +gfx936,int8_w8a8_channel,torch.float16,1664,128,7168,256,8,0,0,asm,12004+22001,1250.3489 +gfx936,int8_w8a8_channel,torch.float16,1792,128,7168,256,8,0,0,asm,11005+21001,1300.5047 +gfx936,int8_w8a8_channel,torch.float16,1920,128,7168,256,8,0,0,asm,12004+22001,1373.9701 +gfx936,int8_w8a8_channel,torch.float16,2048,128,7168,256,8,0,0,asm,12005+22001,1414.9982 +gfx936,int8_w8a8_channel,torch.float16,2304,128,7168,256,8,0,0,asm,12005+22001,1543.9078 +gfx936,int8_w8a8_channel,torch.float16,2560,128,7168,256,8,0,0,asm,13000+23001,1663.1165 +gfx936,int8_w8a8_channel,torch.float16,2816,128,7168,256,8,0,0,asm,13000+23001,1722.0387 +gfx936,int8_w8a8_channel,torch.float16,3072,128,7168,256,8,0,0,asm,12005+22001,1793.9798 +gfx936,int8_w8a8_channel,torch.float16,3328,128,7168,256,8,0,0,asm,12001+22001,1865.4831 +gfx936,int8_w8a8_channel,torch.float16,3584,128,7168,256,8,0,0,asm,12001+22001,1932.7758 +gfx936,int8_w8a8_channel,torch.float16,3840,128,7168,256,8,0,0,asm,12001+22001,2087.9508 
+gfx936,int8_w8a8_channel,torch.float16,4096,128,7168,256,8,0,0,asm,12001+22001,2196.8436 +gfx936,int8_w8a8_channel,torch.float16,4608,128,7168,256,8,0,0,asm,12001+22001,2510.9156 +gfx936,int8_w8a8_channel,torch.float16,5120,128,7168,256,8,0,0,asm,12001+22001,2654.6632 +gfx936,int8_w8a8_channel,torch.float16,5632,128,7168,256,8,0,0,asm,12003+22001,2828.9119 +gfx936,int8_w8a8_channel,torch.float16,6144,128,7168,256,8,0,0,asm,13001+23001,3011.9016 +gfx936,int8_w8a8_channel,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23001,3129.9651 +gfx936,int8_w8a8_channel,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23001,3264.7526 +gfx936,int8_w8a8_channel,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23001,3438.3445 +gfx936,int8_w8a8_channel,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23001,3778.766 +gfx936,int8_w8a8_channel,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23001,4631.5252 +gfx936,int8_w8a8_channel,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23001,5377.9096 +gfx936,int8_w8a8_channel,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23001,6130.6435 +gfx936,int8_w8a8_channel,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23001,7020.1861 +gfx936,int8_w8a8_channel,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23001,7488.0522 +gfx936,int8_w8a8_channel,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23001,10191.3578 +gfx936,int8_w8a8_channel,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23001,13416.1804 +gfx936,int8_w8a8_channel,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15540.521 +gfx936,int8_w8a8_channel,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23101,19230.8184 +gfx936,int8_w8a8_channel,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23001,22907.4573 +gfx936,int8_w8a8_channel,torch.float16,1,256,7168,256,8,0,0,asm,10001+20001,78.2921 +gfx936,int8_w8a8_channel,torch.float16,2,256,7168,256,8,0,0,asm,10002+20001,118.991 +gfx936,int8_w8a8_channel,torch.float16,3,256,7168,256,8,0,0,asm,10001+20001,162.9827 
+gfx936,int8_w8a8_channel,torch.float16,4,256,7168,256,8,0,0,asm,10001+20001,206.8144 +gfx936,int8_w8a8_channel,torch.float16,5,256,7168,256,8,0,0,asm,10001+20001,237.9892 +gfx936,int8_w8a8_channel,torch.float16,6,256,7168,256,8,0,0,asm,10001+20001,273.4584 +gfx936,int8_w8a8_channel,torch.float16,7,256,7168,256,8,0,0,asm,10001+20001,304.4734 +gfx936,int8_w8a8_channel,torch.float16,8,256,7168,256,8,0,0,asm,10001+20001,327.3113 +gfx936,int8_w8a8_channel,torch.float16,9,256,7168,256,8,0,0,asm,10001+20001,361.2314 +gfx936,int8_w8a8_channel,torch.float16,10,256,7168,256,8,0,0,asm,10000+20001,388.6503 +gfx936,int8_w8a8_channel,torch.float16,11,256,7168,256,8,0,0,asm,10001+20001,417.4167 +gfx936,int8_w8a8_channel,torch.float16,12,256,7168,256,8,0,0,asm,10001+20001,436.6167 +gfx936,int8_w8a8_channel,torch.float16,13,256,7168,256,8,0,0,asm,10000+20001,459.8252 +gfx936,int8_w8a8_channel,torch.float16,14,256,7168,256,8,0,0,asm,10000+20001,493.1473 +gfx936,int8_w8a8_channel,torch.float16,15,256,7168,256,8,0,0,asm,10001+20001,512.6842 +gfx936,int8_w8a8_channel,torch.float16,16,256,7168,256,8,0,0,asm,10000+20001,528.9031 +gfx936,int8_w8a8_channel,torch.float16,17,256,7168,256,8,0,0,asm,10000+20001,544.2181 +gfx936,int8_w8a8_channel,torch.float16,18,256,7168,256,8,0,0,asm,10001+20001,569.5065 +gfx936,int8_w8a8_channel,torch.float16,20,256,7168,256,8,0,0,asm,10000+20001,618.4751 +gfx936,int8_w8a8_channel,torch.float16,24,256,7168,256,8,0,0,asm,10000+20001,709.7258 +gfx936,int8_w8a8_channel,torch.float16,28,256,7168,256,8,0,0,asm,10000+20001,809.2237 +gfx936,int8_w8a8_channel,torch.float16,32,256,7168,256,8,0,0,asm,10000+20001,864.8196 +gfx936,int8_w8a8_channel,torch.float16,34,256,7168,256,8,0,0,asm,10000+20001,879.0063 +gfx936,int8_w8a8_channel,torch.float16,36,256,7168,256,8,0,0,asm,10010+20001,908.7073 +gfx936,int8_w8a8_channel,torch.float16,40,256,7168,256,8,0,0,asm,10000+20000,952.076 +gfx936,int8_w8a8_channel,torch.float16,44,256,7168,256,8,0,0,asm,11000+21000,996.2529 
+gfx936,int8_w8a8_channel,torch.float16,48,256,7168,256,8,0,0,asm,10000+20000,1022.4591 +gfx936,int8_w8a8_channel,torch.float16,56,256,7168,256,8,0,0,asm,11000+21000,1080.8678 +gfx936,int8_w8a8_channel,torch.float16,64,256,7168,256,8,0,0,asm,11000+21000,1122.788 +gfx936,int8_w8a8_channel,torch.float16,68,256,7168,256,8,0,0,asm,11000+21000,1136.0427 +gfx936,int8_w8a8_channel,torch.float16,72,256,7168,256,8,0,0,asm,11000+21000,1163.3776 +gfx936,int8_w8a8_channel,torch.float16,80,256,7168,256,8,0,0,asm,11000+21000,1201.1039 +gfx936,int8_w8a8_channel,torch.float16,88,256,7168,256,8,0,0,asm,11000+21000,1241.1966 +gfx936,int8_w8a8_channel,torch.float16,96,256,7168,256,8,0,0,asm,11000+21000,1260.2618 +gfx936,int8_w8a8_channel,torch.float16,104,256,7168,256,8,0,0,asm,11000+21000,1273.7944 +gfx936,int8_w8a8_channel,torch.float16,112,256,7168,256,8,0,0,asm,11000+21001,1286.4258 +gfx936,int8_w8a8_channel,torch.float16,128,256,7168,256,8,0,0,asm,11000+21000,1321.8448 +gfx936,int8_w8a8_channel,torch.float16,144,256,7168,256,8,0,0,asm,10010+20001,1338.8132 +gfx936,int8_w8a8_channel,torch.float16,160,256,7168,256,8,0,0,asm,11000+21000,1357.4742 +gfx936,int8_w8a8_channel,torch.float16,192,256,7168,256,8,0,0,asm,10010+20001,1372.8173 +gfx936,int8_w8a8_channel,torch.float16,224,256,7168,256,8,0,0,asm,10010+20001,1399.2174 +gfx936,int8_w8a8_channel,torch.float16,256,256,7168,256,8,0,0,asm,10010+20001,1403.5626 +gfx936,int8_w8a8_channel,torch.float16,320,256,7168,256,8,0,0,asm,10010+20001,1447.9416 +gfx936,int8_w8a8_channel,torch.float16,384,256,7168,256,8,0,0,asm,11000+21001,1475.1079 +gfx936,int8_w8a8_channel,torch.float16,448,256,7168,256,8,0,0,asm,10013+20001,1487.9921 +gfx936,int8_w8a8_channel,torch.float16,512,256,7168,256,8,0,0,asm,10013+20001,1500.4553 +gfx936,int8_w8a8_channel,torch.float16,576,256,7168,256,8,0,0,asm,10013+20001,1573.1711 +gfx936,int8_w8a8_channel,torch.float16,640,256,7168,256,8,0,0,asm,10011+20001,1601.2554 
+gfx936,int8_w8a8_channel,torch.float16,704,256,7168,256,8,0,0,asm,11001+21001,1633.8197 +gfx936,int8_w8a8_channel,torch.float16,768,256,7168,256,8,0,0,asm,10011+20001,1668.1186 +gfx936,int8_w8a8_channel,torch.float16,832,256,7168,256,8,0,0,asm,11006+21001,1701.5249 +gfx936,int8_w8a8_channel,torch.float16,896,256,7168,256,8,0,0,asm,11006+21001,1721.0365 +gfx936,int8_w8a8_channel,torch.float16,960,256,7168,256,8,0,0,asm,11007+21001,1727.6049 +gfx936,int8_w8a8_channel,torch.float16,1024,256,7168,256,8,0,0,asm,11007+21001,1738.0049 +gfx936,int8_w8a8_channel,torch.float16,1152,256,7168,256,8,0,0,asm,12004+22001,1793.9292 +gfx936,int8_w8a8_channel,torch.float16,1280,256,7168,256,8,0,0,asm,11005+21001,1868.9946 +gfx936,int8_w8a8_channel,torch.float16,1408,256,7168,256,8,0,0,asm,11005+21001,1902.3756 +gfx936,int8_w8a8_channel,torch.float16,1536,256,7168,256,8,0,0,asm,11005+21001,1928.0683 +gfx936,int8_w8a8_channel,torch.float16,1664,256,7168,256,8,0,0,asm,11005+21001,2025.9967 +gfx936,int8_w8a8_channel,torch.float16,1792,256,7168,256,8,0,0,asm,12004+22001,2038.5442 +gfx936,int8_w8a8_channel,torch.float16,1920,256,7168,256,8,0,0,asm,12000+22001,2154.157 +gfx936,int8_w8a8_channel,torch.float16,2048,256,7168,256,8,0,0,asm,13000+23001,2193.4244 +gfx936,int8_w8a8_channel,torch.float16,2304,256,7168,256,8,0,0,asm,13000+23001,2250.4602 +gfx936,int8_w8a8_channel,torch.float16,2560,256,7168,256,8,0,0,asm,13000+23001,2375.4793 +gfx936,int8_w8a8_channel,torch.float16,2816,256,7168,256,8,0,0,asm,13000+23001,2459.2772 +gfx936,int8_w8a8_channel,torch.float16,3072,256,7168,256,8,0,0,asm,13000+23001,2531.6057 +gfx936,int8_w8a8_channel,torch.float16,3328,256,7168,256,8,0,0,asm,13000+23001,2618.5448 +gfx936,int8_w8a8_channel,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23001,2755.9934 +gfx936,int8_w8a8_channel,torch.float16,3840,256,7168,256,8,0,0,asm,12001+22001,2926.2758 +gfx936,int8_w8a8_channel,torch.float16,4096,256,7168,256,8,0,0,asm,13001+23001,3090.1328 
+gfx936,int8_w8a8_channel,torch.float16,4608,256,7168,256,8,0,0,asm,12001+22001,3489.0469 +gfx936,int8_w8a8_channel,torch.float16,5120,256,7168,256,8,0,0,asm,13001+23001,3676.4831 +gfx936,int8_w8a8_channel,torch.float16,5632,256,7168,256,8,0,0,asm,13001+23001,3816.7864 +gfx936,int8_w8a8_channel,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23001,3877.2412 +gfx936,int8_w8a8_channel,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23001,4027.0604 +gfx936,int8_w8a8_channel,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23001,4210.3786 +gfx936,int8_w8a8_channel,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23001,4462.4463 +gfx936,int8_w8a8_channel,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23001,4854.6321 +gfx936,int8_w8a8_channel,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23001,6054.3389 +gfx936,int8_w8a8_channel,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23001,6973.4562 +gfx936,int8_w8a8_channel,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23001,8029.3144 +gfx936,int8_w8a8_channel,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23001,8966.4444 +gfx936,int8_w8a8_channel,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23001,9653.696 +gfx936,int8_w8a8_channel,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23001,13175.2086 +gfx936,int8_w8a8_channel,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23001,17363.5772 +gfx936,int8_w8a8_channel,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23001,21369.1494 +gfx936,int8_w8a8_channel,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23001,25515.9519 +gfx938,int8_w8a8_channel,torch.float16,1,128,7168,256,8,0,0,asm,10001+20101,63.2618 +gfx938,int8_w8a8_channel,torch.float16,2,128,7168,256,8,0,0,asm,10002+20102,75.6895 +gfx938,int8_w8a8_channel,torch.float16,3,128,7168,256,8,0,0,asm,10008+20102,87.3309 +gfx938,int8_w8a8_channel,torch.float16,4,128,7168,256,8,0,0,asm,10008+20102,104.9443 +gfx938,int8_w8a8_channel,torch.float16,5,128,7168,256,8,0,0,asm,10011+20102,131.5369 
+gfx938,int8_w8a8_channel,torch.float16,6,128,7168,256,8,0,0,asm,10011+20102,131.738 +gfx938,int8_w8a8_channel,torch.float16,7,128,7168,256,8,0,0,asm,10011+20102,139.7052 +gfx938,int8_w8a8_channel,torch.float16,8,128,7168,256,8,0,0,asm,10011+20102,147.422 +gfx938,int8_w8a8_channel,torch.float16,9,128,7168,256,8,0,0,asm,10011+20102,160.4928 +gfx938,int8_w8a8_channel,torch.float16,10,128,7168,256,8,0,0,asm,10013+20102,171.5614 +gfx938,int8_w8a8_channel,torch.float16,11,128,7168,256,8,0,0,asm,10002+20102,201.6002 +gfx938,int8_w8a8_channel,torch.float16,12,128,7168,256,8,0,0,asm,10002+20102,204.5267 +gfx938,int8_w8a8_channel,torch.float16,13,128,7168,256,8,0,0,asm,10002+20102,209.5141 +gfx938,int8_w8a8_channel,torch.float16,14,128,7168,256,8,0,0,asm,10008+20102,219.6596 +gfx938,int8_w8a8_channel,torch.float16,15,128,7168,256,8,0,0,asm,10008+20102,222.4576 +gfx938,int8_w8a8_channel,torch.float16,16,128,7168,256,8,0,0,asm,10008+20102,226.195 +gfx938,int8_w8a8_channel,torch.float16,17,128,7168,256,8,0,0,asm,10009+20102,228.1325 +gfx938,int8_w8a8_channel,torch.float16,18,128,7168,256,8,0,0,asm,10008+20102,236.953 +gfx938,int8_w8a8_channel,torch.float16,20,128,7168,256,8,0,0,asm,10011+20102,258.4297 +gfx938,int8_w8a8_channel,torch.float16,24,128,7168,256,8,0,0,asm,10012+20102,281.8882 +gfx938,int8_w8a8_channel,torch.float16,28,128,7168,256,8,0,0,asm,10002+20102,335.2694 +gfx938,int8_w8a8_channel,torch.float16,32,128,7168,256,8,0,0,asm,10002+20102,347.0314 +gfx938,int8_w8a8_channel,torch.float16,34,128,7168,256,8,0,0,asm,10002+20102,340.4068 +gfx938,int8_w8a8_channel,torch.float16,36,128,7168,256,8,0,0,asm,10002+20102,350.6017 +gfx938,int8_w8a8_channel,torch.float16,40,128,7168,256,8,0,0,asm,10002+20102,371.7133 +gfx938,int8_w8a8_channel,torch.float16,44,128,7168,256,8,0,0,asm,10002+20102,381.5184 +gfx938,int8_w8a8_channel,torch.float16,48,128,7168,256,8,0,0,asm,10002+20102,387.3112 +gfx938,int8_w8a8_channel,torch.float16,56,128,7168,256,8,0,0,asm,10013+20102,404.4723 
+gfx938,int8_w8a8_channel,torch.float16,64,128,7168,256,8,0,0,asm,10002+20102,426.9911 +gfx938,int8_w8a8_channel,torch.float16,68,128,7168,256,8,0,0,asm,10001+20102,446.2539 +gfx938,int8_w8a8_channel,torch.float16,72,128,7168,256,8,0,0,asm,10001+20102,455.8723 +gfx938,int8_w8a8_channel,torch.float16,80,128,7168,256,8,0,0,asm,10002+20101,460.7386 +gfx938,int8_w8a8_channel,torch.float16,88,128,7168,256,8,0,0,asm,10002+20102,462.5899 +gfx938,int8_w8a8_channel,torch.float16,96,128,7168,256,8,0,0,asm,10002+20102,462.8021 +gfx938,int8_w8a8_channel,torch.float16,104,128,7168,256,8,0,0,asm,10002+20102,478.4371 +gfx938,int8_w8a8_channel,torch.float16,112,128,7168,256,8,0,0,asm,10002+20102,481.4893 +gfx938,int8_w8a8_channel,torch.float16,128,128,7168,256,8,0,0,asm,10002+20102,487.6923 +gfx938,int8_w8a8_channel,torch.float16,144,128,7168,256,8,0,0,asm,10002+20102,509.8789 +gfx938,int8_w8a8_channel,torch.float16,160,128,7168,256,8,0,0,asm,10002+20102,515.3517 +gfx938,int8_w8a8_channel,torch.float16,192,128,7168,256,8,0,0,asm,10002+20102,519.5833 +gfx938,int8_w8a8_channel,torch.float16,224,128,7168,256,8,0,0,asm,10002+20102,527.8664 +gfx938,int8_w8a8_channel,torch.float16,256,128,7168,256,8,0,0,asm,10002+20102,536.3813 +gfx938,int8_w8a8_channel,torch.float16,320,128,7168,256,8,0,0,asm,10002+20102,555.8232 +gfx938,int8_w8a8_channel,torch.float16,384,128,7168,256,8,0,0,asm,10012+20102,565.3493 +gfx938,int8_w8a8_channel,torch.float16,448,128,7168,256,8,0,0,asm,11005+21102,610.3871 +gfx938,int8_w8a8_channel,torch.float16,512,128,7168,256,8,0,0,asm,11005+21102,633.7347 +gfx938,int8_w8a8_channel,torch.float16,576,128,7168,256,8,0,0,asm,11005+21102,684.5777 +gfx938,int8_w8a8_channel,torch.float16,640,128,7168,256,8,0,0,asm,11005+21102,659.4904 +gfx938,int8_w8a8_channel,torch.float16,704,128,7168,256,8,0,0,asm,11005+21102,667.2115 +gfx938,int8_w8a8_channel,torch.float16,768,128,7168,256,8,0,0,asm,11003+21102,675.0842 
+gfx938,int8_w8a8_channel,torch.float16,832,128,7168,256,8,0,0,asm,11005+21102,688.4298 +gfx938,int8_w8a8_channel,torch.float16,896,128,7168,256,8,0,0,asm,11005+21102,755.3507 +gfx938,int8_w8a8_channel,torch.float16,960,128,7168,256,8,0,0,asm,11005+21102,763.7894 +gfx938,int8_w8a8_channel,torch.float16,1024,128,7168,256,8,0,0,asm,11005+21102,841.7542 +gfx938,int8_w8a8_channel,torch.float16,1152,128,7168,256,8,0,0,asm,12001+22001,977.5854 +gfx938,int8_w8a8_channel,torch.float16,1280,128,7168,256,8,0,0,asm,12005+22001,969.4833 +gfx938,int8_w8a8_channel,torch.float16,1408,128,7168,256,8,0,0,asm,12005+22001,1002.1966 +gfx938,int8_w8a8_channel,torch.float16,1536,128,7168,256,8,0,0,asm,12003+22001,1033.5221 +gfx938,int8_w8a8_channel,torch.float16,1664,128,7168,256,8,0,0,asm,12001+22001,1052.5143 +gfx938,int8_w8a8_channel,torch.float16,1792,128,7168,256,8,0,0,asm,12001+22001,1151.6113 +gfx938,int8_w8a8_channel,torch.float16,1920,128,7168,256,8,0,0,asm,12003+22101,1179.0246 +gfx938,int8_w8a8_channel,torch.float16,2048,128,7168,256,8,0,0,asm,12001+22101,1328.5704 +gfx938,int8_w8a8_channel,torch.float16,2304,128,7168,256,8,0,0,asm,13001+23001,1517.2547 +gfx938,int8_w8a8_channel,torch.float16,2560,128,7168,256,8,0,0,asm,13001+23001,1572.8933 +gfx938,int8_w8a8_channel,torch.float16,2816,128,7168,256,8,0,0,asm,13001+23001,1606.6615 +gfx938,int8_w8a8_channel,torch.float16,3072,128,7168,256,8,0,0,asm,13001+23001,1634.9778 +gfx938,int8_w8a8_channel,torch.float16,3328,128,7168,256,8,0,0,asm,13001+23001,1700.3595 +gfx938,int8_w8a8_channel,torch.float16,3584,128,7168,256,8,0,0,asm,13001+23001,1774.1847 +gfx938,int8_w8a8_channel,torch.float16,3840,128,7168,256,8,0,0,asm,13001+23001,1987.5334 +gfx938,int8_w8a8_channel,torch.float16,4096,128,7168,256,8,0,0,asm,12001+22101,2207.8893 +gfx938,int8_w8a8_channel,torch.float16,4608,128,7168,256,8,0,0,asm,12001+22101,2549.7134 +gfx938,int8_w8a8_channel,torch.float16,5120,128,7168,256,8,0,0,asm,12001+22101,2702.7999 
+gfx938,int8_w8a8_channel,torch.float16,5632,128,7168,256,8,0,0,asm,12001+22101,2874.6924 +gfx938,int8_w8a8_channel,torch.float16,6144,128,7168,256,8,0,0,asm,13001+23101,3113.1382 +gfx938,int8_w8a8_channel,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23100,3219.0612 +gfx938,int8_w8a8_channel,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23101,3310.4812 +gfx938,int8_w8a8_channel,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23101,3438.692 +gfx938,int8_w8a8_channel,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23101,3769.0124 +gfx938,int8_w8a8_channel,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23101,4589.1196 +gfx938,int8_w8a8_channel,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23101,5414.4453 +gfx938,int8_w8a8_channel,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23101,6272.9851 +gfx938,int8_w8a8_channel,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23101,6928.7073 +gfx938,int8_w8a8_channel,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23101,7446.1985 +gfx938,int8_w8a8_channel,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23101,10133.096 +gfx938,int8_w8a8_channel,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23101,13291.5275 +gfx938,int8_w8a8_channel,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15668.9581 +gfx938,int8_w8a8_channel,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23101,18765.4355 +gfx938,int8_w8a8_channel,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23101,22021.7106 +gfx938,int8_w8a8_channel,torch.float16,65536,128,7168,256,8,0,0,asm,13001+23101,25347.7545 +gfx938,int8_w8a8_channel,torch.float16,1,256,7168,256,8,0,0,asm,10002+20001,69.8059 +gfx938,int8_w8a8_channel,torch.float16,2,256,7168,256,8,0,0,asm,10008+20001,105.3216 +gfx938,int8_w8a8_channel,torch.float16,3,256,7168,256,8,0,0,asm,10011+20001,137.0302 +gfx938,int8_w8a8_channel,torch.float16,4,256,7168,256,8,0,0,asm,10011+20001,166.402 +gfx938,int8_w8a8_channel,torch.float16,5,256,7168,256,8,0,0,asm,10002+20001,199.3649 
+gfx938,int8_w8a8_channel,torch.float16,6,256,7168,256,8,0,0,asm,10008+20001,216.123 +gfx938,int8_w8a8_channel,torch.float16,7,256,7168,256,8,0,0,asm,10008+20001,232.9755 +gfx938,int8_w8a8_channel,torch.float16,8,256,7168,256,8,0,0,asm,10011+20001,248.2603 +gfx938,int8_w8a8_channel,torch.float16,9,256,7168,256,8,0,0,asm,10011+20001,265.2203 +gfx938,int8_w8a8_channel,torch.float16,10,256,7168,256,8,0,0,asm,10011+20001,282.6291 +gfx938,int8_w8a8_channel,torch.float16,11,256,7168,256,8,0,0,asm,10002+20001,310.1867 +gfx938,int8_w8a8_channel,torch.float16,12,256,7168,256,8,0,0,asm,10002+20001,321.9316 +gfx938,int8_w8a8_channel,torch.float16,13,256,7168,256,8,0,0,asm,10008+20001,338.8822 +gfx938,int8_w8a8_channel,torch.float16,14,256,7168,256,8,0,0,asm,10002+20001,358.7315 +gfx938,int8_w8a8_channel,torch.float16,15,256,7168,256,8,0,0,asm,10011+20001,368.3992 +gfx938,int8_w8a8_channel,torch.float16,16,256,7168,256,8,0,0,asm,10011+20001,378.1179 +gfx938,int8_w8a8_channel,torch.float16,17,256,7168,256,8,0,0,asm,10011+20001,381.3561 +gfx938,int8_w8a8_channel,torch.float16,18,256,7168,256,8,0,0,asm,10011+20001,397.2116 +gfx938,int8_w8a8_channel,torch.float16,20,256,7168,256,8,0,0,asm,10002+20001,431.1681 +gfx938,int8_w8a8_channel,torch.float16,24,256,7168,256,8,0,0,asm,10011+20001,479.9186 +gfx938,int8_w8a8_channel,torch.float16,28,256,7168,256,8,0,0,asm,10002+20001,552.5507 +gfx938,int8_w8a8_channel,torch.float16,32,256,7168,256,8,0,0,asm,10011+20001,587.4382 +gfx938,int8_w8a8_channel,torch.float16,34,256,7168,256,8,0,0,asm,10011+20001,579.5075 +gfx938,int8_w8a8_channel,torch.float16,36,256,7168,256,8,0,0,asm,10011+20001,600.686 +gfx938,int8_w8a8_channel,torch.float16,40,256,7168,256,8,0,0,asm,10002+20001,623.1973 +gfx938,int8_w8a8_channel,torch.float16,44,256,7168,256,8,0,0,asm,10002+20001,651.7723 +gfx938,int8_w8a8_channel,torch.float16,48,256,7168,256,8,0,0,asm,10002+20001,663.6823 +gfx938,int8_w8a8_channel,torch.float16,56,256,7168,256,8,0,0,asm,10002+20001,699.227 
+gfx938,int8_w8a8_channel,torch.float16,64,256,7168,256,8,0,0,asm,10002+20001,718.8464 +gfx938,int8_w8a8_channel,torch.float16,68,256,7168,256,8,0,0,asm,10002+20001,727.4162 +gfx938,int8_w8a8_channel,torch.float16,72,256,7168,256,8,0,0,asm,10002+20000,752.4242 +gfx938,int8_w8a8_channel,torch.float16,80,256,7168,256,8,0,0,asm,10002+20001,798.6361 +gfx938,int8_w8a8_channel,torch.float16,88,256,7168,256,8,0,0,asm,10011+20001,803.9207 +gfx938,int8_w8a8_channel,torch.float16,96,256,7168,256,8,0,0,asm,10011+20001,797.7572 +gfx938,int8_w8a8_channel,torch.float16,104,256,7168,256,8,0,0,asm,10002+20001,817.8816 +gfx938,int8_w8a8_channel,torch.float16,112,256,7168,256,8,0,0,asm,10011+20001,819.8212 +gfx938,int8_w8a8_channel,torch.float16,128,256,7168,256,8,0,0,asm,10002+20001,836.1965 +gfx938,int8_w8a8_channel,torch.float16,144,256,7168,256,8,0,0,asm,10002+20001,844.6419 +gfx938,int8_w8a8_channel,torch.float16,160,256,7168,256,8,0,0,asm,10002+20001,861.065 +gfx938,int8_w8a8_channel,torch.float16,192,256,7168,256,8,0,0,asm,10002+20001,870.0983 +gfx938,int8_w8a8_channel,torch.float16,224,256,7168,256,8,0,0,asm,10002+20001,904.8108 +gfx938,int8_w8a8_channel,torch.float16,256,256,7168,256,8,0,0,asm,10002+20001,900.3962 +gfx938,int8_w8a8_channel,torch.float16,320,256,7168,256,8,0,0,asm,10011+20001,938.3914 +gfx938,int8_w8a8_channel,torch.float16,384,256,7168,256,8,0,0,asm,10011+20001,950.1246 +gfx938,int8_w8a8_channel,torch.float16,448,256,7168,256,8,0,0,asm,11005+21001,991.3086 +gfx938,int8_w8a8_channel,torch.float16,512,256,7168,256,8,0,0,asm,11005+21001,1024.873 +gfx938,int8_w8a8_channel,torch.float16,576,256,7168,256,8,0,0,asm,11003+21001,1085.3904 +gfx938,int8_w8a8_channel,torch.float16,640,256,7168,256,8,0,0,asm,11005+21001,1064.9267 +gfx938,int8_w8a8_channel,torch.float16,704,256,7168,256,8,0,0,asm,11005+21001,1082.1236 +gfx938,int8_w8a8_channel,torch.float16,768,256,7168,256,8,0,0,asm,11003+21001,1086.2148 
+gfx938,int8_w8a8_channel,torch.float16,832,256,7168,256,8,0,0,asm,11005+21001,1118.0076 +gfx938,int8_w8a8_channel,torch.float16,896,256,7168,256,8,0,0,asm,11005+21001,1177.0868 +gfx938,int8_w8a8_channel,torch.float16,960,256,7168,256,8,0,0,asm,11005+21001,1261.5258 +gfx938,int8_w8a8_channel,torch.float16,1024,256,7168,256,8,0,0,asm,12001+22001,1321.1087 +gfx938,int8_w8a8_channel,torch.float16,1152,256,7168,256,8,0,0,asm,12001+22001,1378.0109 +gfx938,int8_w8a8_channel,torch.float16,1280,256,7168,256,8,0,0,asm,12001+22001,1392.1021 +gfx938,int8_w8a8_channel,torch.float16,1408,256,7168,256,8,0,0,asm,12001+22001,1430.3588 +gfx938,int8_w8a8_channel,torch.float16,1536,256,7168,256,8,0,0,asm,12001+22001,1464.291 +gfx938,int8_w8a8_channel,torch.float16,1664,256,7168,256,8,0,0,asm,12003+22001,1505.5708 +gfx938,int8_w8a8_channel,torch.float16,1792,256,7168,256,8,0,0,asm,12001+22001,1589.9691 +gfx938,int8_w8a8_channel,torch.float16,1920,256,7168,256,8,0,0,asm,12001+22001,1632.3197 +gfx938,int8_w8a8_channel,torch.float16,2048,256,7168,256,8,0,0,asm,12001+22001,1866.2468 +gfx938,int8_w8a8_channel,torch.float16,2304,256,7168,256,8,0,0,asm,13001+23001,2048.1579 +gfx938,int8_w8a8_channel,torch.float16,2560,256,7168,256,8,0,0,asm,13001+23001,2092.4777 +gfx938,int8_w8a8_channel,torch.float16,2816,256,7168,256,8,0,0,asm,13001+23001,2139.2016 +gfx938,int8_w8a8_channel,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23001,2186.3933 +gfx938,int8_w8a8_channel,torch.float16,3328,256,7168,256,8,0,0,asm,13001+23001,2256.4647 +gfx938,int8_w8a8_channel,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23001,2336.6121 +gfx938,int8_w8a8_channel,torch.float16,3840,256,7168,256,8,0,0,asm,13001+23001,2576.566 +gfx938,int8_w8a8_channel,torch.float16,4096,256,7168,256,8,0,0,asm,13001+23001,2972.2115 +gfx938,int8_w8a8_channel,torch.float16,4608,256,7168,256,8,0,0,asm,12001+22001,3615.1791 +gfx938,int8_w8a8_channel,torch.float16,5120,256,7168,256,8,0,0,asm,12001+22001,3819.4244 
+gfx938,int8_w8a8_channel,torch.float16,5632,256,7168,256,8,0,0,asm,12001+22001,4030.1803 +gfx938,int8_w8a8_channel,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23001,4172.1696 +gfx938,int8_w8a8_channel,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23001,4255.9334 +gfx938,int8_w8a8_channel,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23001,4370.7083 +gfx938,int8_w8a8_channel,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23001,4623.5482 +gfx938,int8_w8a8_channel,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23001,5132.8724 +gfx938,int8_w8a8_channel,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23001,6291.0284 +gfx938,int8_w8a8_channel,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23001,7314.9996 +gfx938,int8_w8a8_channel,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23001,8463.126 +gfx938,int8_w8a8_channel,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23001,9513.7307 +gfx938,int8_w8a8_channel,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23001,10340.0984 +gfx938,int8_w8a8_channel,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23001,13947.7674 +gfx938,int8_w8a8_channel,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23001,18301.4314 +gfx938,int8_w8a8_channel,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23001,22619.4892 +gfx938,int8_w8a8_channel,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23001,26910.4336 +gfx938,int8_w8a8_channel,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23001,31283.1318 +gfx938,int8_w8a8_channel,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23001,35592.7466 +gfx938,f8_w8a8_channel,torch.float16,1,128,7168,256,8,0,0,asm,10000+20101,61.4201 +gfx938,f8_w8a8_channel,torch.float16,2,128,7168,256,8,0,0,asm,10002+20102,74.3393 +gfx938,f8_w8a8_channel,torch.float16,3,128,7168,256,8,0,0,asm,10009+20102,87.5437 +gfx938,f8_w8a8_channel,torch.float16,4,128,7168,256,8,0,0,asm,10009+20102,104.5218 +gfx938,f8_w8a8_channel,torch.float16,5,128,7168,256,8,0,0,asm,10011+20102,131.6919 
+gfx938,f8_w8a8_channel,torch.float16,6,128,7168,256,8,0,0,asm,10011+20102,136.3617 +gfx938,f8_w8a8_channel,torch.float16,7,128,7168,256,8,0,0,asm,10011+20102,140.4929 +gfx938,f8_w8a8_channel,torch.float16,8,128,7168,256,8,0,0,asm,10011+20102,146.7534 +gfx938,f8_w8a8_channel,torch.float16,9,128,7168,256,8,0,0,asm,10011+20102,162.0682 +gfx938,f8_w8a8_channel,torch.float16,10,128,7168,256,8,0,0,asm,10013+20102,174.0107 +gfx938,f8_w8a8_channel,torch.float16,11,128,7168,256,8,0,0,asm,10002+20102,203.5942 +gfx938,f8_w8a8_channel,torch.float16,12,128,7168,256,8,0,0,asm,10002+20102,207.1347 +gfx938,f8_w8a8_channel,torch.float16,13,128,7168,256,8,0,0,asm,10002+20102,209.305 +gfx938,f8_w8a8_channel,torch.float16,14,128,7168,256,8,0,0,asm,10008+20102,219.6352 +gfx938,f8_w8a8_channel,torch.float16,15,128,7168,256,8,0,0,asm,10008+20102,221.8342 +gfx938,f8_w8a8_channel,torch.float16,16,128,7168,256,8,0,0,asm,10009+20102,226.0433 +gfx938,f8_w8a8_channel,torch.float16,17,128,7168,256,8,0,0,asm,10008+20102,228.4813 +gfx938,f8_w8a8_channel,torch.float16,18,128,7168,256,8,0,0,asm,10002+20102,237.708 +gfx938,f8_w8a8_channel,torch.float16,20,128,7168,256,8,0,0,asm,10011+20102,259.1725 +gfx938,f8_w8a8_channel,torch.float16,24,128,7168,256,8,0,0,asm,10013+20102,281.3468 +gfx938,f8_w8a8_channel,torch.float16,28,128,7168,256,8,0,0,asm,10002+20102,336.0654 +gfx938,f8_w8a8_channel,torch.float16,32,128,7168,256,8,0,0,asm,10002+20102,349.288 +gfx938,f8_w8a8_channel,torch.float16,34,128,7168,256,8,0,0,asm,10002+20102,337.0594 +gfx938,f8_w8a8_channel,torch.float16,36,128,7168,256,8,0,0,asm,10002+20102,348.8132 +gfx938,f8_w8a8_channel,torch.float16,40,128,7168,256,8,0,0,asm,10002+20102,369.2685 +gfx938,f8_w8a8_channel,torch.float16,44,128,7168,256,8,0,0,asm,10002+20102,383.8079 +gfx938,f8_w8a8_channel,torch.float16,48,128,7168,256,8,0,0,asm,10002+20102,388.2017 +gfx938,f8_w8a8_channel,torch.float16,56,128,7168,256,8,0,0,asm,10002+20102,406.815 
+gfx938,f8_w8a8_channel,torch.float16,64,128,7168,256,8,0,0,asm,10002+20102,431.9513 +gfx938,f8_w8a8_channel,torch.float16,68,128,7168,256,8,0,0,asm,10002+20102,465.6659 +gfx938,f8_w8a8_channel,torch.float16,72,128,7168,256,8,0,0,asm,10001+20001,467.1784 +gfx938,f8_w8a8_channel,torch.float16,80,128,7168,256,8,0,0,asm,10002+20102,454.265 +gfx938,f8_w8a8_channel,torch.float16,88,128,7168,256,8,0,0,asm,10002+20102,466.2786 +gfx938,f8_w8a8_channel,torch.float16,96,128,7168,256,8,0,0,asm,10002+20102,466.5357 +gfx938,f8_w8a8_channel,torch.float16,104,128,7168,256,8,0,0,asm,10002+20102,476.3777 +gfx938,f8_w8a8_channel,torch.float16,112,128,7168,256,8,0,0,asm,10002+20102,478.2608 +gfx938,f8_w8a8_channel,torch.float16,128,128,7168,256,8,0,0,asm,10002+20102,491.1387 +gfx938,f8_w8a8_channel,torch.float16,144,128,7168,256,8,0,0,asm,10002+20102,502.7711 +gfx938,f8_w8a8_channel,torch.float16,160,128,7168,256,8,0,0,asm,10002+20102,516.1437 +gfx938,f8_w8a8_channel,torch.float16,192,128,7168,256,8,0,0,asm,10002+20102,519.8894 +gfx938,f8_w8a8_channel,torch.float16,224,128,7168,256,8,0,0,asm,10002+20102,532.9433 +gfx938,f8_w8a8_channel,torch.float16,256,128,7168,256,8,0,0,asm,10002+20102,535.2698 +gfx938,f8_w8a8_channel,torch.float16,320,128,7168,256,8,0,0,asm,10002+20102,559.5246 +gfx938,f8_w8a8_channel,torch.float16,384,128,7168,256,8,0,0,asm,10011+20102,571.1548 +gfx938,f8_w8a8_channel,torch.float16,448,128,7168,256,8,0,0,asm,11005+21102,612.7505 +gfx938,f8_w8a8_channel,torch.float16,512,128,7168,256,8,0,0,asm,11007+21102,634.4969 +gfx938,f8_w8a8_channel,torch.float16,576,128,7168,256,8,0,0,asm,11007+21102,681.9113 +gfx938,f8_w8a8_channel,torch.float16,640,128,7168,256,8,0,0,asm,11005+21102,662.978 +gfx938,f8_w8a8_channel,torch.float16,704,128,7168,256,8,0,0,asm,11005+21102,672.4385 +gfx938,f8_w8a8_channel,torch.float16,768,128,7168,256,8,0,0,asm,11007+21102,690.7955 +gfx938,f8_w8a8_channel,torch.float16,832,128,7168,256,8,0,0,asm,11005+21102,695.1789 
+gfx938,f8_w8a8_channel,torch.float16,896,128,7168,256,8,0,0,asm,11005+21102,753.6196 +gfx938,f8_w8a8_channel,torch.float16,960,128,7168,256,8,0,0,asm,11005+21102,763.1376 +gfx938,f8_w8a8_channel,torch.float16,1024,128,7168,256,8,0,0,asm,11003+21102,853.5372 +gfx938,f8_w8a8_channel,torch.float16,1152,128,7168,256,8,0,0,asm,12001+22001,975.9654 +gfx938,f8_w8a8_channel,torch.float16,1280,128,7168,256,8,0,0,asm,12001+22001,971.1162 +gfx938,f8_w8a8_channel,torch.float16,1408,128,7168,256,8,0,0,asm,12005+22001,1000.0597 +gfx938,f8_w8a8_channel,torch.float16,1536,128,7168,256,8,0,0,asm,12005+22001,1020.6874000000001 +gfx938,f8_w8a8_channel,torch.float16,1664,128,7168,256,8,0,0,asm,12003+22001,1054.1529 +gfx938,f8_w8a8_channel,torch.float16,1792,128,7168,256,8,0,0,asm,12001+22001,1156.2478 +gfx938,f8_w8a8_channel,torch.float16,1920,128,7168,256,8,0,0,asm,12001+22001,1199.8661 +gfx938,f8_w8a8_channel,torch.float16,2048,128,7168,256,8,0,0,asm,12001+22101,1344.1486 +gfx938,f8_w8a8_channel,torch.float16,2304,128,7168,256,8,0,0,asm,13001+23001,1508.7961 +gfx938,f8_w8a8_channel,torch.float16,2560,128,7168,256,8,0,0,asm,13001+23001,1546.9773 +gfx938,f8_w8a8_channel,torch.float16,2816,128,7168,256,8,0,0,asm,13001+23001,1592.0286 +gfx938,f8_w8a8_channel,torch.float16,3072,128,7168,256,8,0,0,asm,13001+23001,1634.9946 +gfx938,f8_w8a8_channel,torch.float16,3328,128,7168,256,8,0,0,asm,13001+23001,1695.0105 +gfx938,f8_w8a8_channel,torch.float16,3584,128,7168,256,8,0,0,asm,13001+23001,1754.4279 +gfx938,f8_w8a8_channel,torch.float16,3840,128,7168,256,8,0,0,asm,13001+23001,1963.7276 +gfx938,f8_w8a8_channel,torch.float16,4096,128,7168,256,8,0,0,asm,12001+22101,2231.6895 +gfx938,f8_w8a8_channel,torch.float16,4608,128,7168,256,8,0,0,asm,12003+22101,2601.7681 +gfx938,f8_w8a8_channel,torch.float16,5120,128,7168,256,8,0,0,asm,12001+22001,2735.0065 +gfx938,f8_w8a8_channel,torch.float16,5632,128,7168,256,8,0,0,asm,12001+22001,2902.2632 
+gfx938,f8_w8a8_channel,torch.float16,6144,128,7168,256,8,0,0,asm,13001+23001,3103.9732 +gfx938,f8_w8a8_channel,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23001,3204.7276 +gfx938,f8_w8a8_channel,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23001,3293.3468 +gfx938,f8_w8a8_channel,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23001,3450.7 +gfx938,f8_w8a8_channel,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23101,3826.76 +gfx938,f8_w8a8_channel,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23101,4667.9011 +gfx938,f8_w8a8_channel,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23101,5498.0926 +gfx938,f8_w8a8_channel,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23001,6370.1259 +gfx938,f8_w8a8_channel,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23101,7049.5697 +gfx938,f8_w8a8_channel,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23101,7572.7029 +gfx938,f8_w8a8_channel,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23101,10339.622 +gfx938,f8_w8a8_channel,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23101,13538.5271 +gfx938,f8_w8a8_channel,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15969.9307 +gfx938,f8_w8a8_channel,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23101,19164.5979 +gfx938,f8_w8a8_channel,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23101,22502.1147 +gfx938,f8_w8a8_channel,torch.float16,65536,128,7168,256,8,0,0,asm,13001+23101,25939.8595 +gfx938,f8_w8a8_channel,torch.float16,1,256,7168,256,8,0,0,asm,10002+20001,69.9253 +gfx938,f8_w8a8_channel,torch.float16,2,256,7168,256,8,0,0,asm,10008+20001,103.5412 +gfx938,f8_w8a8_channel,torch.float16,3,256,7168,256,8,0,0,asm,10011+20001,138.3292 +gfx938,f8_w8a8_channel,torch.float16,4,256,7168,256,8,0,0,asm,10011+20001,165.8163 +gfx938,f8_w8a8_channel,torch.float16,5,256,7168,256,8,0,0,asm,10001+20001,199.2916 +gfx938,f8_w8a8_channel,torch.float16,6,256,7168,256,8,0,0,asm,10008+20001,215.6812 
+gfx938,f8_w8a8_channel,torch.float16,7,256,7168,256,8,0,0,asm,10008+20001,232.7508 +gfx938,f8_w8a8_channel,torch.float16,8,256,7168,256,8,0,0,asm,10011+20001,247.9325 +gfx938,f8_w8a8_channel,torch.float16,9,256,7168,256,8,0,0,asm,10011+20001,262.6708 +gfx938,f8_w8a8_channel,torch.float16,10,256,7168,256,8,0,0,asm,10011+20001,283.1306 +gfx938,f8_w8a8_channel,torch.float16,11,256,7168,256,8,0,0,asm,10002+20001,309.4865 +gfx938,f8_w8a8_channel,torch.float16,12,256,7168,256,8,0,0,asm,10002+20001,321.3357 +gfx938,f8_w8a8_channel,torch.float16,13,256,7168,256,8,0,0,asm,10008+20001,338.1219 +gfx938,f8_w8a8_channel,torch.float16,14,256,7168,256,8,0,0,asm,10002+20001,358.7315 +gfx938,f8_w8a8_channel,torch.float16,15,256,7168,256,8,0,0,asm,10011+20001,368.3992 +gfx938,f8_w8a8_channel,torch.float16,16,256,7168,256,8,0,0,asm,10011+20001,378.1179 +gfx938,f8_w8a8_channel,torch.float16,17,256,7168,256,8,0,0,asm,10011+20001,381.3561 +gfx938,f8_w8a8_channel,torch.float16,18,256,7168,256,8,0,0,asm,10011+20001,397.2116 +gfx938,f8_w8a8_channel,torch.float16,20,256,7168,256,8,0,0,asm,10002+20001,431.1681 +gfx938,f8_w8a8_channel,torch.float16,24,256,7168,256,8,0,0,asm,10011+20001,479.9186 +gfx938,f8_w8a8_channel,torch.float16,28,256,7168,256,8,0,0,asm,10002+20001,552.5507 +gfx938,f8_w8a8_channel,torch.float16,32,256,7168,256,8,0,0,asm,10011+20001,587.4382 +gfx938,f8_w8a8_channel,torch.float16,34,256,7168,256,8,0,0,asm,10011+20001,579.5075 +gfx938,f8_w8a8_channel,torch.float16,36,256,7168,256,8,0,0,asm,10011+20001,600.686 +gfx938,f8_w8a8_channel,torch.float16,40,256,7168,256,8,0,0,asm,10002+20001,623.1973 +gfx938,f8_w8a8_channel,torch.float16,44,256,7168,256,8,0,0,asm,10002+20001,651.7723 +gfx938,f8_w8a8_channel,torch.float16,48,256,7168,256,8,0,0,asm,10002+20001,663.6823 +gfx938,f8_w8a8_channel,torch.float16,56,256,7168,256,8,0,0,asm,10002+20001,699.227 +gfx938,f8_w8a8_channel,torch.float16,64,256,7168,256,8,0,0,asm,10002+20001,718.8464 
+gfx938,f8_w8a8_channel,torch.float16,68,256,7168,256,8,0,0,asm,10002+20001,727.4162 +gfx938,f8_w8a8_channel,torch.float16,72,256,7168,256,8,0,0,asm,10002+20000,752.4242 +gfx938,f8_w8a8_channel,torch.float16,80,256,7168,256,8,0,0,asm,10002+20001,798.6361 +gfx938,f8_w8a8_channel,torch.float16,88,256,7168,256,8,0,0,asm,10011+20001,803.9207 +gfx938,f8_w8a8_channel,torch.float16,96,256,7168,256,8,0,0,asm,10011+20001,797.7572 +gfx938,f8_w8a8_channel,torch.float16,104,256,7168,256,8,0,0,asm,10002+20001,817.8816 +gfx938,f8_w8a8_channel,torch.float16,112,256,7168,256,8,0,0,asm,10011+20001,819.8212 +gfx938,f8_w8a8_channel,torch.float16,128,256,7168,256,8,0,0,asm,10002+20001,836.1965 +gfx938,f8_w8a8_channel,torch.float16,144,256,7168,256,8,0,0,asm,10002+20001,844.6419 +gfx938,f8_w8a8_channel,torch.float16,160,256,7168,256,8,0,0,asm,10002+20001,861.065 +gfx938,f8_w8a8_channel,torch.float16,192,256,7168,256,8,0,0,asm,10002+20001,870.0983 +gfx938,f8_w8a8_channel,torch.float16,224,256,7168,256,8,0,0,asm,10002+20001,904.8108 +gfx938,f8_w8a8_channel,torch.float16,256,256,7168,256,8,0,0,asm,10002+20001,900.3962 +gfx938,f8_w8a8_channel,torch.float16,320,256,7168,256,8,0,0,asm,10011+20001,938.3914 +gfx938,f8_w8a8_channel,torch.float16,384,256,7168,256,8,0,0,asm,10011+20001,950.1246 +gfx938,f8_w8a8_channel,torch.float16,448,256,7168,256,8,0,0,asm,11005+21001,991.3086 +gfx938,f8_w8a8_channel,torch.float16,512,256,7168,256,8,0,0,asm,11005+21001,1024.873 +gfx938,f8_w8a8_channel,torch.float16,576,256,7168,256,8,0,0,asm,11003+21001,1085.3904 +gfx938,f8_w8a8_channel,torch.float16,640,256,7168,256,8,0,0,asm,11005+21001,1064.9267 +gfx938,f8_w8a8_channel,torch.float16,704,256,7168,256,8,0,0,asm,11005+21001,1082.1236 +gfx938,f8_w8a8_channel,torch.float16,768,256,7168,256,8,0,0,asm,11003+21001,1086.2148 +gfx938,f8_w8a8_channel,torch.float16,832,256,7168,256,8,0,0,asm,11005+21001,1118.0076 +gfx938,f8_w8a8_channel,torch.float16,896,256,7168,256,8,0,0,asm,11005+21001,1177.0868 
+gfx938,f8_w8a8_channel,torch.float16,960,256,7168,256,8,0,0,asm,11005+21001,1261.5258 +gfx938,f8_w8a8_channel,torch.float16,1024,256,7168,256,8,0,0,asm,12001+22001,1321.1087 +gfx938,f8_w8a8_channel,torch.float16,1152,256,7168,256,8,0,0,asm,12001+22001,1378.0109 +gfx938,f8_w8a8_channel,torch.float16,1280,256,7168,256,8,0,0,asm,12001+22001,1392.1021 +gfx938,f8_w8a8_channel,torch.float16,1408,256,7168,256,8,0,0,asm,12001+22001,1430.3588 +gfx938,f8_w8a8_channel,torch.float16,1536,256,7168,256,8,0,0,asm,12001+22001,1464.291 +gfx938,f8_w8a8_channel,torch.float16,1664,256,7168,256,8,0,0,asm,12003+22001,1505.5708 +gfx938,f8_w8a8_channel,torch.float16,1792,256,7168,256,8,0,0,asm,12001+22001,1589.9691 +gfx938,f8_w8a8_channel,torch.float16,1920,256,7168,256,8,0,0,asm,12001+22001,1632.3197 +gfx938,f8_w8a8_channel,torch.float16,2048,256,7168,256,8,0,0,asm,12001+22001,1866.2468 +gfx938,f8_w8a8_channel,torch.float16,2304,256,7168,256,8,0,0,asm,13001+23001,2048.1579 +gfx938,f8_w8a8_channel,torch.float16,2560,256,7168,256,8,0,0,asm,13001+23001,2092.4777 +gfx938,f8_w8a8_channel,torch.float16,2816,256,7168,256,8,0,0,asm,13001+23001,2139.2016 +gfx938,f8_w8a8_channel,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23001,2186.3933 +gfx938,f8_w8a8_channel,torch.float16,3328,256,7168,256,8,0,0,asm,13001+23001,2256.4647 +gfx938,f8_w8a8_channel,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23001,2336.6121 +gfx938,f8_w8a8_channel,torch.float16,3840,256,7168,256,8,0,0,asm,13001+23001,2576.566 +gfx938,f8_w8a8_channel,torch.float16,4096,256,7168,256,8,0,0,asm,13001+23001,2972.2115 +gfx938,f8_w8a8_channel,torch.float16,4608,256,7168,256,8,0,0,asm,12001+22001,3615.1791 +gfx938,f8_w8a8_channel,torch.float16,5120,256,7168,256,8,0,0,asm,12001+22001,3819.4244 +gfx938,f8_w8a8_channel,torch.float16,5632,256,7168,256,8,0,0,asm,12001+22001,4030.1803 +gfx938,f8_w8a8_channel,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23001,4172.1696 
+gfx938,f8_w8a8_channel,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23001,4255.9334 +gfx938,f8_w8a8_channel,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23001,4370.7083 +gfx938,f8_w8a8_channel,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23001,4623.5482 +gfx938,f8_w8a8_channel,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23001,5132.8724 +gfx938,f8_w8a8_channel,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23001,6291.0284 +gfx938,f8_w8a8_channel,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23001,7314.9996 +gfx938,f8_w8a8_channel,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23001,8463.126 +gfx938,f8_w8a8_channel,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23001,9513.7307 +gfx938,f8_w8a8_channel,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23001,10340.0984 +gfx938,f8_w8a8_channel,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23001,13947.7674 +gfx938,f8_w8a8_channel,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23001,18301.4314 +gfx938,f8_w8a8_channel,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23001,22619.4892 +gfx938,f8_w8a8_channel,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23001,26910.4336 +gfx938,f8_w8a8_channel,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23001,31283.1318 +gfx938,f8_w8a8_channel,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23001,35592.7466 +gfx938,f8_w8a8_channel,torch.float16,1,352,4096,128,8,0,0,asm,10008+20001,59.0495 +gfx938,f8_w8a8_channel,torch.float16,2,352,4096,128,8,0,0,asm,10011+20000,81.9294 +gfx938,f8_w8a8_channel,torch.float16,4,352,4096,128,8,0,0,asm,10002+20000,130.3336 +gfx938,f8_w8a8_channel,torch.float16,6,352,4096,128,8,0,0,asm,10011+20000,163.967 +gfx938,f8_w8a8_channel,torch.float16,8,352,4096,128,8,0,0,asm,10008+20000,209.4489 +gfx938,f8_w8a8_channel,torch.float16,12,352,4096,128,8,0,0,asm,10002+20000,275.9665 +gfx938,f8_w8a8_channel,torch.float16,16,352,4096,128,8,0,0,asm,10011+20000,307.3096 
+gfx938,f8_w8a8_channel,torch.float16,20,352,4096,128,8,0,0,asm,10011+20000,360.1989 +gfx938,f8_w8a8_channel,torch.float16,24,352,4096,128,8,0,0,asm,10011+20000,364.7463 +gfx938,f8_w8a8_channel,torch.float16,28,352,4096,128,8,0,0,asm,10011+20000,372.2441 +gfx938,f8_w8a8_channel,torch.float16,32,352,4096,128,8,0,0,asm,10011+20000,375.8735 +gfx938,f8_w8a8_channel,torch.float16,36,352,4096,128,8,0,0,asm,10011+20000,390.2199 +gfx938,f8_w8a8_channel,torch.float16,40,352,4096,128,8,0,0,asm,10011+20000,398.8093 +gfx938,f8_w8a8_channel,torch.float16,44,352,4096,128,8,0,0,asm,10002+20000,422.6661 +gfx938,f8_w8a8_channel,torch.float16,48,352,4096,128,8,0,0,asm,10002+20000,432.2913 +gfx938,f8_w8a8_channel,torch.float16,56,352,4096,128,8,0,0,asm,10002+20000,431.5165 +gfx938,f8_w8a8_channel,torch.float16,64,352,4096,128,8,0,0,asm,10002+20000,443.0479 +gfx938,f8_w8a8_channel,torch.float16,72,352,4096,128,8,0,0,asm,10002+20000,437.2681 +gfx938,f8_w8a8_channel,torch.float16,80,352,4096,128,8,0,0,asm,10011+20000,440.7965 +gfx938,f8_w8a8_channel,torch.float16,96,352,4096,128,8,0,0,asm,10002+20000,443.7775 +gfx938,f8_w8a8_channel,torch.float16,104,352,4096,128,8,0,0,asm,10002+20000,444.527 +gfx938,f8_w8a8_channel,torch.float16,112,352,4096,128,8,0,0,asm,10002+20000,446.3712 +gfx938,f8_w8a8_channel,torch.float16,128,352,4096,128,8,0,0,asm,10002+20000,446.9692 +gfx938,f8_w8a8_channel,torch.float16,144,352,4096,128,8,0,0,asm,10011+20000,506.7920 +gfx938,f8_w8a8_channel,torch.float16,160,352,4096,128,8,0,0,asm,10002+20000,459.6512 +gfx938,f8_w8a8_channel,torch.float16,192,352,4096,128,8,0,0,asm,10011+20000,463.1543 +gfx938,f8_w8a8_channel,torch.float16,224,352,4096,128,8,0,0,asm,11005+21000,482.8511 +gfx938,f8_w8a8_channel,torch.float16,256,352,4096,128,8,0,0,asm,11005+21000,497.2088 +gfx938,f8_w8a8_channel,torch.float16,320,352,4096,128,8,0,0,asm,11003+21000,509.8992 +gfx938,f8_w8a8_channel,torch.float16,384,352,4096,128,8,0,0,asm,12005+22000,566.4802 
+gfx938,f8_w8a8_channel,torch.float16,448,352,4096,128,8,0,0,asm,11005+21000,548.4002 +gfx938,f8_w8a8_channel,torch.float16,512,352,4096,128,8,0,0,asm,12001+22001,573.5201 +gfx938,f8_w8a8_channel,torch.float16,768,352,4096,128,8,0,0,asm,12005+22000,661.2503 +gfx938,f8_w8a8_channel,torch.float16,896,352,4096,128,8,0,0,asm,12005+22001,671.4481 +gfx938,f8_w8a8_channel,torch.float16,960,352,4096,128,8,0,0,asm,12001+22001,693.4522 +gfx938,f8_w8a8_channel,torch.float16,1024,352,4096,128,8,0,0,asm,12001+22001,761.8308 +gfx938,f8_w8a8_channel,torch.float16,1280,352,4096,128,8,0,0,asm,13001+23001,849.0305 +gfx938,f8_w8a8_channel,torch.float16,1536,352,4096,128,8,0,0,asm,13001+23001,899.2703 +gfx938,f8_w8a8_channel,torch.float16,1920,352,4096,128,8,0,0,asm,13001+23001,1012.836 +gfx938,f8_w8a8_channel,torch.float16,2048,352,4096,128,8,0,0,asm,13001+23001,1191.8837 +gfx938,f8_w8a8_channel,torch.float16,2304,352,4096,128,8,0,0,asm,12005+22001,1445.8615 +gfx938,f8_w8a8_channel,torch.float16,2560,352,4096,128,8,0,0,asm,12005+22001,1491.5539 +gfx938,f8_w8a8_channel,torch.float16,3072,352,4096,128,8,0,0,asm,13001+23001,1566.3831 +gfx938,f8_w8a8_channel,torch.float16,3584,352,4096,128,8,0,0,asm,13001+23001,1651.9658 +gfx938,f8_w8a8_channel,torch.float16,3840,352,4096,128,8,0,0,asm,13001+23001,1747.4601 +gfx938,f8_w8a8_channel,torch.float16,4096,352,4096,128,8,0,0,asm,13001+23001,1971.5349 +gfx938,f8_w8a8_channel,torch.float16,4608,352,4096,128,8,0,0,asm,13001+23001,2280.3588 +gfx938,f8_w8a8_channel,torch.float16,5120,352,4096,128,8,0,0,asm,13001+23001,2363.9879 +gfx938,f8_w8a8_channel,torch.float16,6144,352,4096,128,8,0,0,asm,13001+23001,2770.6472 +gfx938,f8_w8a8_channel,torch.float16,7168,352,4096,128,8,0,0,asm,13001+23001,3191.6222 +gfx938,f8_w8a8_channel,torch.float16,8192,352,4096,128,8,0,0,asm,13001+23001,3572.9594 +gfx938,f8_w8a8_channel,torch.float16,10240,352,4096,128,8,0,0,asm,13001+23001,4374.4799 
+gfx938,f8_w8a8_channel,torch.float16,12288,352,4096,128,8,0,0,asm,13001+23001,5152.6491 +gfx938,f8_w8a8_channel,torch.float16,16384,352,4096,128,8,0,0,asm,13001+23001,6720.6083 +gfx938,f8_w8a8_channel,torch.float16,24576,352,4096,128,8,0,0,asm,13001+23001,10204.6509 +gfx938,f8_w8a8_channel,torch.float16,32768,352,4096,128,8,0,0,asm,13001+23001,13134.8606 +gfx938,f8_w8a8_channel,torch.float16,1,352,4096,129,9,0,0,asm,10008+20000,59.1505 +gfx938,f8_w8a8_channel,torch.float16,2,352,4096,129,9,0,0,asm,10011+20001,84.2789 +gfx938,f8_w8a8_channel,torch.float16,4,352,4096,129,9,0,0,asm,10008+20000,133.5504 +gfx938,f8_w8a8_channel,torch.float16,6,352,4096,129,9,0,0,asm,10011+20000,169.0869 +gfx938,f8_w8a8_channel,torch.float16,8,352,4096,129,9,0,0,asm,10011+20000,214.6952 +gfx938,f8_w8a8_channel,torch.float16,12,352,4096,129,9,0,0,asm,10011+20000,269.0782 +gfx938,f8_w8a8_channel,torch.float16,16,352,4096,129,9,0,0,asm,10011+20000,304.5727 +gfx938,f8_w8a8_channel,torch.float16,20,352,4096,129,9,0,0,asm,10011+20000,348.6201 +gfx938,f8_w8a8_channel,torch.float16,24,352,4096,129,9,0,0,asm,10011+20000,358.1189 +gfx938,f8_w8a8_channel,torch.float16,28,352,4096,129,9,0,0,asm,10011+20000,355.6209 +gfx938,f8_w8a8_channel,torch.float16,32,352,4096,129,9,0,0,asm,10011+20000,375.9661 +gfx938,f8_w8a8_channel,torch.float16,36,352,4096,129,9,0,0,asm,10011+20000,401.5798 +gfx938,f8_w8a8_channel,torch.float16,40,352,4096,129,9,0,0,asm,10002+20000,416.6366 +gfx938,f8_w8a8_channel,torch.float16,44,352,4096,129,9,0,0,asm,10002+20000,432.7375 +gfx938,f8_w8a8_channel,torch.float16,48,352,4096,129,9,0,0,asm,10002+20000,436.7544 +gfx938,f8_w8a8_channel,torch.float16,56,352,4096,129,9,0,0,asm,10011+20000,439.3396 +gfx938,f8_w8a8_channel,torch.float16,64,352,4096,129,9,0,0,asm,10002+20000,448.3868 +gfx938,f8_w8a8_channel,torch.float16,72,352,4096,129,9,0,0,asm,10011+20000,443.5838 +gfx938,f8_w8a8_channel,torch.float16,80,352,4096,129,9,0,0,asm,10002+20000,446.7838 
+gfx938,f8_w8a8_channel,torch.float16,96,352,4096,129,9,0,0,asm,10002+20000,449.3269 +gfx938,f8_w8a8_channel,torch.float16,104,352,4096,129,9,0,0,asm,10002+20000,452.3079 +gfx938,f8_w8a8_channel,torch.float16,112,352,4096,129,9,0,0,asm,10002+20000,451.1291 +gfx938,f8_w8a8_channel,torch.float16,128,352,4096,129,9,0,0,asm,10002+20000,454.4974 +gfx938,f8_w8a8_channel,torch.float16,144,352,4096,129,9,0,0,asm,10002+20001,481.6721 +gfx938,f8_w8a8_channel,torch.float16,160,352,4096,129,9,0,0,asm,10011+20000,459.5079 +gfx938,f8_w8a8_channel,torch.float16,192,352,4096,129,9,0,0,asm,11005+21000,480.771 +gfx938,f8_w8a8_channel,torch.float16,224,352,4096,129,9,0,0,asm,11005+21000,488.4005 +gfx938,f8_w8a8_channel,torch.float16,256,352,4096,129,9,0,0,asm,11005+21000,495.8952 +gfx938,f8_w8a8_channel,torch.float16,320,352,4096,129,9,0,0,asm,11005+21000,511.9877 +gfx938,f8_w8a8_channel,torch.float16,384,352,4096,129,9,0,0,asm,11005+21000,551.8949 +gfx938,f8_w8a8_channel,torch.float16,448,352,4096,129,9,0,0,asm,12005+22001,566.3791 +gfx938,f8_w8a8_channel,torch.float16,512,352,4096,129,9,0,0,asm,12005+22001,582.6737 +gfx938,f8_w8a8_channel,torch.float16,768,352,4096,129,9,0,0,asm,12001+22001,631.3219 +gfx938,f8_w8a8_channel,torch.float16,896,352,4096,129,9,0,0,asm,12001+22001,741.4014 +gfx938,f8_w8a8_channel,torch.float16,960,352,4096,129,9,0,0,asm,12001+22001,828.7526 +gfx938,f8_w8a8_channel,torch.float16,1024,352,4096,129,9,0,0,asm,13001+23001,848.8536 +gfx938,f8_w8a8_channel,torch.float16,1280,352,4096,129,9,0,0,asm,13001+23001,876.4829 +gfx938,f8_w8a8_channel,torch.float16,1536,352,4096,129,9,0,0,asm,13001+23001,915.5733 +gfx938,f8_w8a8_channel,torch.float16,1920,352,4096,129,9,0,0,asm,13001+23001,1321.8536 +gfx938,f8_w8a8_channel,torch.float16,2048,352,4096,129,9,0,0,asm,12001+22001,1418.2911 +gfx938,f8_w8a8_channel,torch.float16,2304,352,4096,129,9,0,0,asm,12001+22001,1505.9959 +gfx938,f8_w8a8_channel,torch.float16,2560,352,4096,129,9,0,0,asm,13001+23001,1598.4671 
+gfx938,f8_w8a8_channel,torch.float16,3072,352,4096,129,9,0,0,asm,13001+23001,1680.7319 +gfx938,f8_w8a8_channel,torch.float16,3584,352,4096,129,9,0,0,asm,13001+23001,1969.1433 +gfx938,f8_w8a8_channel,torch.float16,3840,352,4096,129,9,0,0,asm,13001+23001,2186.6076 +gfx938,f8_w8a8_channel,torch.float16,4096,352,4096,129,9,0,0,asm,13001+23001,2298.5144 +gfx938,f8_w8a8_channel,torch.float16,4608,352,4096,129,9,0,0,asm,13001+23001,2375.6426 +gfx938,f8_w8a8_channel,torch.float16,5120,352,4096,129,9,0,0,asm,13001+23001,2498.682 +gfx938,f8_w8a8_channel,torch.float16,6144,352,4096,129,9,0,0,asm,13001+23001,3104.0772 +gfx938,f8_w8a8_channel,torch.float16,7168,352,4096,129,9,0,0,asm,13001+23001,3422.5768 +gfx938,f8_w8a8_channel,torch.float16,8192,352,4096,129,9,0,0,asm,13001+23001,4048.6372 +gfx938,f8_w8a8_channel,torch.float16,10240,352,4096,129,9,0,0,asm,13001+23001,5000.6413 +gfx938,f8_w8a8_channel,torch.float16,12288,352,4096,129,9,0,0,asm,13001+23001,5698.078 +gfx938,f8_w8a8_channel,torch.float16,16384,352,4096,129,9,0,0,asm,13001+23001,7470.3985 +gfx938,f8_w8a8_channel,torch.float16,24576,352,4096,129,9,0,0,asm,13001+23001,11072.2942 +gfx938,f8_w8a8_channel,torch.float16,32768,352,4096,129,9,0,0,asm,13001+23001,14596.8846 +gfx936,int8_w8a8_channel,torch.float16,1,384,3072,256,8,0,0,asm,10008+20000,56.8264 +gfx936,int8_w8a8_channel,torch.float16,2,384,3072,256,8,0,0,asm,10013+20000,81.7951 +gfx936,int8_w8a8_channel,torch.float16,4,384,3072,256,8,0,0,asm,10002+20000,132.7506 +gfx936,int8_w8a8_channel,torch.float16,6,384,3072,256,8,0,0,asm,10002+20000,179.5884 +gfx936,int8_w8a8_channel,torch.float16,8,384,3072,256,8,0,0,asm,10002+20000,215.5043 +gfx936,int8_w8a8_channel,torch.float16,12,384,3072,256,8,0,0,asm,10002+20000,285.9295 +gfx936,int8_w8a8_channel,torch.float16,16,384,3072,256,8,0,0,asm,10002+20000,345.4998 +gfx936,int8_w8a8_channel,torch.float16,24,384,3072,256,8,0,0,asm,10002+20000,455.0041 
+gfx936,int8_w8a8_channel,torch.float16,32,384,3072,256,8,0,0,asm,10002+20000,557.1627 +gfx936,int8_w8a8_channel,torch.float16,36,384,3072,256,8,0,0,asm,10002+20000,575.2362 +gfx936,int8_w8a8_channel,torch.float16,48,384,3072,256,8,0,0,asm,10002+20000,648.0703 +gfx936,int8_w8a8_channel,torch.float16,56,384,3072,256,8,0,0,asm,10002+20000,681.6115 +gfx936,int8_w8a8_channel,torch.float16,64,384,3072,256,8,0,0,asm,10002+20000,709.3423 +gfx936,int8_w8a8_channel,torch.float16,72,384,3072,256,8,0,0,asm,10002+20000,733.8225 +gfx936,int8_w8a8_channel,torch.float16,80,384,3072,256,8,0,0,asm,10002+20000,762.3616 +gfx936,int8_w8a8_channel,torch.float16,88,384,3072,256,8,0,0,asm,10002+20000,786.6734 +gfx936,int8_w8a8_channel,torch.float16,96,384,3072,256,8,0,0,asm,10002+20000,799.2798 +gfx936,int8_w8a8_channel,torch.float16,100,384,3072,256,8,0,0,asm,10002+20000,800.8546 +gfx936,int8_w8a8_channel,torch.float16,112,384,3072,256,8,0,0,asm,10002+20000,818.8457 +gfx936,int8_w8a8_channel,torch.float16,128,384,3072,256,8,0,0,asm,10002+20000,839.4823 +gfx936,int8_w8a8_channel,torch.float16,144,384,3072,256,8,0,0,asm,10002+20000,847.5918 +gfx936,int8_w8a8_channel,torch.float16,160,384,3072,256,8,0,0,asm,10002+20000,856.3665 +gfx936,int8_w8a8_channel,torch.float16,192,384,3072,256,8,0,0,asm,10002+20000,871.7264 +gfx936,int8_w8a8_channel,torch.float16,224,384,3072,256,8,0,0,asm,10002+20000,887.1116 +gfx936,int8_w8a8_channel,torch.float16,256,384,3072,256,8,0,0,asm,10002+20000,899.8696 +gfx936,int8_w8a8_channel,torch.float16,320,384,3072,256,8,0,0,asm,10002+20000,925.1746 +gfx936,int8_w8a8_channel,torch.float16,384,384,3072,256,8,0,0,asm,11007+21000,950.7745 +gfx936,int8_w8a8_channel,torch.float16,448,384,3072,256,8,0,0,asm,11007+21000,963.7851 +gfx936,int8_w8a8_channel,torch.float16,512,384,3072,256,8,0,0,asm,11007+21000,983.2628 +gfx936,int8_w8a8_channel,torch.float16,640,384,3072,256,8,0,0,asm,11007+21000,1016.4499 
+gfx936,int8_w8a8_channel,torch.float16,768,384,3072,256,8,0,0,asm,11007+21000,1044.2646 +gfx936,int8_w8a8_channel,torch.float16,896,384,3072,256,8,0,0,asm,11007+21001,1079.5319 +gfx936,int8_w8a8_channel,torch.float16,1024,384,3072,256,8,0,0,asm,12005+22001,1101.6539 +gfx936,int8_w8a8_channel,torch.float16,1280,384,3072,256,8,0,0,asm,12005+22001,1150.9423 +gfx936,int8_w8a8_channel,torch.float16,1536,384,3072,256,8,0,0,asm,12005+22001,1212.8369 +gfx936,int8_w8a8_channel,torch.float16,2048,384,3072,256,8,0,0,asm,12001+22001,1368.7025 +gfx936,int8_w8a8_channel,torch.float16,2304,384,3072,256,8,0,0,asm,13001+23001,1423.6751 +gfx936,int8_w8a8_channel,torch.float16,2560,384,3072,256,8,0,0,asm,13001+23001,1469.0813 +gfx936,int8_w8a8_channel,torch.float16,3072,384,3072,256,8,0,0,asm,13001+23001,1561.0895 +gfx936,int8_w8a8_channel,torch.float16,3584,384,3072,256,8,0,0,asm,13001+23001,1670.9839 +gfx936,int8_w8a8_channel,torch.float16,4096,384,3072,256,8,0,0,asm,13001+23001,1887.3709 +gfx936,int8_w8a8_channel,torch.float16,5120,384,3072,256,8,0,0,asm,13001+23001,2325.1719 +gfx936,int8_w8a8_channel,torch.float16,6144,384,3072,256,8,0,0,asm,13001+23001,2460.3279 +gfx936,int8_w8a8_channel,torch.float16,7168,384,3072,256,8,0,0,asm,13001+23001,2649.5652 +gfx936,int8_w8a8_channel,torch.float16,8192,384,3072,256,8,0,0,asm,13001+23001,3069.7829 +gfx936,int8_w8a8_channel,torch.float16,10240,384,3072,256,8,0,0,asm,13001+23001,3692.2064 +gfx936,int8_w8a8_channel,torch.float16,12288,384,3072,256,8,0,0,asm,13001+23001,4303.6906 +gfx936,int8_w8a8_channel,torch.float16,16384,384,3072,256,8,0,0,asm,13001+23001,5566.5242 +gfx936,int8_w8a8_channel,torch.float16,24576,384,3072,256,8,0,0,asm,13001+23001,8120.7216 +gfx936,int8_w8a8_channel,torch.float16,32768,384,3072,256,8,0,0,asm,13001+23001,10670.8687 +gfx936,int8_w8a8_channel,torch.float16,1,192,3072,256,8,0,0,asm,10002+20000,44.6414 +gfx936,int8_w8a8_channel,torch.float16,2,192,3072,256,8,0,0,asm,10008+20000,58.1486 
+gfx936,int8_w8a8_channel,torch.float16,4,192,3072,256,8,0,0,asm,10002+20000,84.5319 +gfx936,int8_w8a8_channel,torch.float16,6,192,3072,256,8,0,0,asm,10002+20000,111.858 +gfx936,int8_w8a8_channel,torch.float16,8,192,3072,256,8,0,0,asm,10008+20001,130.1991 +gfx936,int8_w8a8_channel,torch.float16,12,192,3072,256,8,0,0,asm,10002+20000,173.3315 +gfx936,int8_w8a8_channel,torch.float16,16,192,3072,256,8,0,0,asm,10002+20000,204.5231 +gfx936,int8_w8a8_channel,torch.float16,24,192,3072,256,8,0,0,asm,10002+20000,261.8589 +gfx936,int8_w8a8_channel,torch.float16,32,192,3072,256,8,0,0,asm,10002+20001,320.0092 +gfx936,int8_w8a8_channel,torch.float16,36,192,3072,256,8,0,0,asm,10002+20001,322.6504 +gfx936,int8_w8a8_channel,torch.float16,48,192,3072,256,8,0,0,asm,10002+20001,361.5388 +gfx936,int8_w8a8_channel,torch.float16,56,192,3072,256,8,0,0,asm,10002+20001,380.0229 +gfx936,int8_w8a8_channel,torch.float16,64,192,3072,256,8,0,0,asm,10002+20001,391.8629 +gfx936,int8_w8a8_channel,torch.float16,72,192,3072,256,8,0,0,asm,10002+20001,410.7261 +gfx936,int8_w8a8_channel,torch.float16,80,192,3072,256,8,0,0,asm,10002+20001,423.1218 +gfx936,int8_w8a8_channel,torch.float16,88,192,3072,256,8,0,0,asm,10002+20001,437.1513 +gfx936,int8_w8a8_channel,torch.float16,96,192,3072,256,8,0,0,asm,10002+20001,440.9997 +gfx936,int8_w8a8_channel,torch.float16,100,192,3072,256,8,0,0,asm,10002+20001,443.8797 +gfx936,int8_w8a8_channel,torch.float16,112,192,3072,256,8,0,0,asm,10002+20001,451.1302 +gfx936,int8_w8a8_channel,torch.float16,128,192,3072,256,8,0,0,asm,10002+20001,462.8607 +gfx936,int8_w8a8_channel,torch.float16,144,192,3072,256,8,0,0,asm,10002+20001,469.9682 +gfx936,int8_w8a8_channel,torch.float16,160,192,3072,256,8,0,0,asm,10002+20001,474.6839 +gfx936,int8_w8a8_channel,torch.float16,192,192,3072,256,8,0,0,asm,10002+20001,485.3197 +gfx936,int8_w8a8_channel,torch.float16,224,192,3072,256,8,0,0,asm,10002+20001,491.1639 
+gfx936,int8_w8a8_channel,torch.float16,256,192,3072,256,8,0,0,asm,10002+20001,498.7007 +gfx936,int8_w8a8_channel,torch.float16,320,192,3072,256,8,0,0,asm,10002+20001,512.8228 +gfx936,int8_w8a8_channel,torch.float16,384,192,3072,256,8,0,0,asm,10002+20001,533.9512 +gfx936,int8_w8a8_channel,torch.float16,448,192,3072,256,8,0,0,asm,11006+21001,550.4481 +gfx936,int8_w8a8_channel,torch.float16,512,192,3072,256,8,0,0,asm,11006+21001,566.9281 +gfx936,int8_w8a8_channel,torch.float16,640,192,3072,256,8,0,0,asm,11004+21001,590.7595 +gfx936,int8_w8a8_channel,torch.float16,768,192,3072,256,8,0,0,asm,11004+21001,620.5279 +gfx936,int8_w8a8_channel,torch.float16,896,192,3072,256,8,0,0,asm,11005+21001,647.4415 +gfx936,int8_w8a8_channel,torch.float16,1024,192,3072,256,8,0,0,asm,12004+22001,686.1783 +gfx936,int8_w8a8_channel,torch.float16,1280,192,3072,256,8,0,0,asm,12004+22001,723.6267 +gfx936,int8_w8a8_channel,torch.float16,1536,192,3072,256,8,0,0,asm,12000+22001,776.4603 +gfx936,int8_w8a8_channel,torch.float16,2048,192,3072,256,8,0,0,asm,12001+22001,931.5421 +gfx936,int8_w8a8_channel,torch.float16,2304,192,3072,256,8,0,0,asm,13001+23001,1029.8156 +gfx936,int8_w8a8_channel,torch.float16,2560,192,3072,256,8,0,0,asm,13001+23001,1062.6493 +gfx936,int8_w8a8_channel,torch.float16,3072,192,3072,256,8,0,0,asm,13001+23001,1136.1481 +gfx936,int8_w8a8_channel,torch.float16,3584,192,3072,256,8,0,0,asm,13000+23001,1235.8199 +gfx936,int8_w8a8_channel,torch.float16,4096,192,3072,256,8,0,0,asm,13001+23001,1479.2555 +gfx936,int8_w8a8_channel,torch.float16,5120,192,3072,256,8,0,0,asm,12000+22001,1810.9182 +gfx936,int8_w8a8_channel,torch.float16,6144,192,3072,256,8,0,0,asm,13001+23001,2028.7453 +gfx936,int8_w8a8_channel,torch.float16,7168,192,3072,256,8,0,0,asm,13001+23001,2175.0862 +gfx936,int8_w8a8_channel,torch.float16,8192,192,3072,256,8,0,0,asm,13001+23001,2538.3363 +gfx936,int8_w8a8_channel,torch.float16,10240,192,3072,256,8,0,0,asm,13001+23001,3077.1904 
+gfx936,int8_w8a8_channel,torch.float16,12288,192,3072,256,8,0,0,asm,13001+23001,3606.2003 +gfx936,int8_w8a8_channel,torch.float16,16384,192,3072,256,8,0,0,asm,13001+23001,4650.6454 +gfx936,int8_w8a8_channel,torch.float16,24576,192,3072,256,8,0,0,asm,13001+23001,6805.7334 +gfx936,int8_w8a8_channel,torch.float16,32768,192,3072,256,8,0,0,asm,13001+23001,8965.0994 +gfx936,int8_w8a8_channel,torch.float16,1,192,5120,160,8,0,0,asm,10002+20000,51.2097 +gfx936,int8_w8a8_channel,torch.float16,2,192,5120,160,8,0,0,asm,10009+20001,72.2287 +gfx936,int8_w8a8_channel,torch.float16,4,192,5120,160,8,0,0,asm,10002+20000,107.8833 +gfx936,int8_w8a8_channel,torch.float16,6,192,5120,160,8,0,0,asm,10001+20000,147.9592 +gfx936,int8_w8a8_channel,torch.float16,8,192,5120,160,8,0,0,asm,10002+20000,183.6054 +gfx936,int8_w8a8_channel,torch.float16,12,192,5120,160,8,0,0,asm,10002+20001,251.9926 +gfx936,int8_w8a8_channel,torch.float16,16,192,5120,160,8,0,0,asm,10002+20001,294.4095 +gfx936,int8_w8a8_channel,torch.float16,24,192,5120,160,8,0,0,asm,10002+20001,354.6758 +gfx936,int8_w8a8_channel,torch.float16,32,192,5120,160,8,0,0,asm,10002+20001,398.502 +gfx936,int8_w8a8_channel,torch.float16,36,192,5120,160,8,0,0,asm,10002+20001,409.6315 +gfx936,int8_w8a8_channel,torch.float16,48,192,5120,160,8,0,0,asm,10002+20001,437.9347 +gfx936,int8_w8a8_channel,torch.float16,56,192,5120,160,8,0,0,asm,10002+20001,454.9199 +gfx936,int8_w8a8_channel,torch.float16,64,192,5120,160,8,0,0,asm,10002+20001,460.5957 +gfx936,int8_w8a8_channel,torch.float16,72,192,5120,160,8,0,0,asm,10002+20001,468.8146 +gfx936,int8_w8a8_channel,torch.float16,80,192,5120,160,8,0,0,asm,10002+20001,475.0883 +gfx936,int8_w8a8_channel,torch.float16,88,192,5120,160,8,0,0,asm,10002+20001,478.4988 +gfx936,int8_w8a8_channel,torch.float16,96,192,5120,160,8,0,0,asm,10002+20001,482.1788 +gfx936,int8_w8a8_channel,torch.float16,100,192,5120,160,8,0,0,asm,10002+20001,486.3304 
+gfx936,int8_w8a8_channel,torch.float16,112,192,5120,160,8,0,0,asm,10002+20001,492.0398 +gfx936,int8_w8a8_channel,torch.float16,128,192,5120,160,8,0,0,asm,10002+20001,496.6546 +gfx936,int8_w8a8_channel,torch.float16,144,192,5120,160,8,0,0,asm,10002+20001,503.2904 +gfx936,int8_w8a8_channel,torch.float16,160,192,5120,160,8,0,0,asm,10002+20001,506.4402 +gfx936,int8_w8a8_channel,torch.float16,192,192,5120,160,8,0,0,asm,10002+20001,516.1665 +gfx936,int8_w8a8_channel,torch.float16,224,192,5120,160,8,0,0,asm,10002+20001,529.9265 +gfx936,int8_w8a8_channel,torch.float16,256,192,5120,160,8,0,0,asm,10002+20001,540.7391 +gfx936,int8_w8a8_channel,torch.float16,320,192,5120,160,8,0,0,asm,11006+21001,567.9896 +gfx936,int8_w8a8_channel,torch.float16,384,192,5120,160,8,0,0,asm,11006+21001,584.0148 +gfx936,int8_w8a8_channel,torch.float16,448,192,5120,160,8,0,0,asm,11006+21001,599.0296 +gfx936,int8_w8a8_channel,torch.float16,512,192,5120,160,8,0,0,asm,11005+21001,634.3727 +gfx936,int8_w8a8_channel,torch.float16,640,192,5120,160,8,0,0,asm,12001+22001,682.2043 +gfx936,int8_w8a8_channel,torch.float16,768,192,5120,160,8,0,0,asm,12001+22001,709.539 +gfx936,int8_w8a8_channel,torch.float16,896,192,5120,160,8,0,0,asm,12001+22001,739.1137 +gfx936,int8_w8a8_channel,torch.float16,1024,192,5120,160,8,0,0,asm,12004+22001,779.4253 +gfx936,int8_w8a8_channel,torch.float16,1280,192,5120,160,8,0,0,asm,12000+22001,909.4463 +gfx936,int8_w8a8_channel,torch.float16,1536,192,5120,160,8,0,0,asm,13001+23001,1025.7494 +gfx936,int8_w8a8_channel,torch.float16,2048,192,5120,160,8,0,0,asm,13001+23001,1125.4882 +gfx936,int8_w8a8_channel,torch.float16,2304,192,5120,160,8,0,0,asm,13001+23001,1233.5892 +gfx936,int8_w8a8_channel,torch.float16,2560,192,5120,160,8,0,0,asm,12000+22001,1461.5554 +gfx936,int8_w8a8_channel,torch.float16,3072,192,5120,160,8,0,0,asm,12000+22001,1716.0143 +gfx936,int8_w8a8_channel,torch.float16,3584,192,5120,160,8,0,0,asm,12000+22001,1865.3784 
+gfx936,int8_w8a8_channel,torch.float16,4096,192,5120,160,8,0,0,asm,13001+23001,1995.8962 +gfx936,int8_w8a8_channel,torch.float16,5120,192,5120,160,8,0,0,asm,13001+23001,2466.4559 +gfx936,int8_w8a8_channel,torch.float16,6144,192,5120,160,8,0,0,asm,13001+23001,2914.4052 +gfx936,int8_w8a8_channel,torch.float16,7168,192,5120,160,8,0,0,asm,13001+23001,3161.0576 +gfx936,int8_w8a8_channel,torch.float16,8192,192,5120,160,8,0,0,asm,13001+23001,3814.2362 +gfx936,int8_w8a8_channel,torch.float16,10240,192,5120,160,8,0,0,asm,13001+23001,4517.2759 +gfx936,int8_w8a8_channel,torch.float16,12288,192,5120,160,8,0,0,asm,13001+23001,5242.8252 +gfx936,int8_w8a8_channel,torch.float16,16384,192,5120,160,8,0,0,asm,13001+23001,7024.2813 +gfx936,int8_w8a8_channel,torch.float16,24576,192,5120,160,8,0,0,asm,13001+23001,10233.6104 +gfx936,int8_w8a8_channel,torch.float16,32768,192,5120,160,8,0,0,asm,13001+23001,13478.1774 +gfx936,int8_w8a8_channel,torch.float16,1,96,5120,160,8,0,0,asm,10001+20000,43.2436 +gfx936,int8_w8a8_channel,torch.float16,2,96,5120,160,8,0,0,asm,10002+20001,54.5699 +gfx936,int8_w8a8_channel,torch.float16,4,96,5120,160,8,0,0,asm,10008+20001,75.2353 +gfx936,int8_w8a8_channel,torch.float16,6,96,5120,160,8,0,0,asm,10011+20001,94.9994 +gfx936,int8_w8a8_channel,torch.float16,8,96,5120,160,8,0,0,asm,10002+20001,112.7595 +gfx936,int8_w8a8_channel,torch.float16,12,96,5120,160,8,0,0,asm,10002+20001,153.0122 +gfx936,int8_w8a8_channel,torch.float16,16,96,5120,160,8,0,0,asm,10002+20001,173.0712 +gfx936,int8_w8a8_channel,torch.float16,24,96,5120,160,8,0,0,asm,10002+20001,204.1503 +gfx936,int8_w8a8_channel,torch.float16,32,96,5120,160,8,0,0,asm,10002+20001,235.4881 +gfx936,int8_w8a8_channel,torch.float16,36,96,5120,160,8,0,0,asm,10002+20001,235.5018 +gfx936,int8_w8a8_channel,torch.float16,48,96,5120,160,8,0,0,asm,10002+20001,249.2198 +gfx936,int8_w8a8_channel,torch.float16,56,96,5120,160,8,0,0,asm,10002+20001,262.0619 
+gfx936,int8_w8a8_channel,torch.float16,64,96,5120,160,8,0,0,asm,10002+20001,265.0346 +gfx936,int8_w8a8_channel,torch.float16,72,96,5120,160,8,0,0,asm,10008+20001,270.1714 +gfx936,int8_w8a8_channel,torch.float16,80,96,5120,160,8,0,0,asm,10002+20001,271.9567 +gfx936,int8_w8a8_channel,torch.float16,88,96,5120,160,8,0,0,asm,10002+20001,272.5714 +gfx936,int8_w8a8_channel,torch.float16,96,96,5120,160,8,0,0,asm,10008+20001,277.5483 +gfx936,int8_w8a8_channel,torch.float16,100,96,5120,160,8,0,0,asm,10002+20001,280.3778 +gfx936,int8_w8a8_channel,torch.float16,112,96,5120,160,8,0,0,asm,10002+20001,281.6999 +gfx936,int8_w8a8_channel,torch.float16,128,96,5120,160,8,0,0,asm,10002+20001,285.6494 +gfx936,int8_w8a8_channel,torch.float16,144,96,5120,160,8,0,0,asm,10002+20001,290.7609 +gfx936,int8_w8a8_channel,torch.float16,160,96,5120,160,8,0,0,asm,10001+20001,296.5799 +gfx936,int8_w8a8_channel,torch.float16,192,96,5120,160,8,0,0,asm,10002+20001,300.6978 +gfx936,int8_w8a8_channel,torch.float16,224,96,5120,160,8,0,0,asm,10002+20001,310.3484 +gfx936,int8_w8a8_channel,torch.float16,256,96,5120,160,8,0,0,asm,10002+20001,321.8684 +gfx936,int8_w8a8_channel,torch.float16,320,96,5120,160,8,0,0,asm,11004+21001,347.6705 +gfx936,int8_w8a8_channel,torch.float16,384,96,5120,160,8,0,0,asm,11005+21001,361.161 +gfx936,int8_w8a8_channel,torch.float16,448,96,5120,160,8,0,0,asm,11005+21001,370.062 +gfx936,int8_w8a8_channel,torch.float16,512,96,5120,160,8,0,0,asm,11003+21001,392.8317 +gfx936,int8_w8a8_channel,torch.float16,640,96,5120,160,8,0,0,asm,11005+21001,449.4717 +gfx936,int8_w8a8_channel,torch.float16,768,96,5120,160,8,0,0,asm,12001+22001,488.2422 +gfx936,int8_w8a8_channel,torch.float16,896,96,5120,160,8,0,0,asm,12001+22001,513.1684 +gfx936,int8_w8a8_channel,torch.float16,1024,96,5120,160,8,0,0,asm,12001+22001,536.2168 +gfx936,int8_w8a8_channel,torch.float16,1280,96,5120,160,8,0,0,asm,12001+22001,650.3052 +gfx936,int8_w8a8_channel,torch.float16,1536,96,5120,160,8,0,0,asm,12001+22001,783.265 
+gfx936,int8_w8a8_channel,torch.float16,2048,96,5120,160,8,0,0,asm,13001+23001,866.7007 +gfx936,int8_w8a8_channel,torch.float16,2304,96,5120,160,8,0,0,asm,13001+23001,931.2733 +gfx936,int8_w8a8_channel,torch.float16,2560,96,5120,160,8,0,0,asm,12001+22001,1096.5869 +gfx936,int8_w8a8_channel,torch.float16,3072,96,5120,160,8,0,0,asm,12001+22001,1280.1487 +gfx936,int8_w8a8_channel,torch.float16,3584,96,5120,160,8,0,0,asm,12001+22001,1389.3526 +gfx936,int8_w8a8_channel,torch.float16,4096,96,5120,160,8,0,0,asm,13001+23001,1541.3777 +gfx936,int8_w8a8_channel,torch.float16,5120,96,5120,160,8,0,0,asm,13001+23001,1930.5899 +gfx936,int8_w8a8_channel,torch.float16,6144,96,5120,160,8,0,0,asm,13001+23001,2287.2296 +gfx936,int8_w8a8_channel,torch.float16,7168,96,5120,160,8,0,0,asm,13001+23001,2472.2483 +gfx936,int8_w8a8_channel,torch.float16,8192,96,5120,160,8,0,0,asm,13001+23001,2966.6731 +gfx936,int8_w8a8_channel,torch.float16,10240,96,5120,160,8,0,0,asm,13001+23001,3557.0135 +gfx936,int8_w8a8_channel,torch.float16,12288,96,5120,160,8,0,0,asm,13001+23001,4122.2256 +gfx936,int8_w8a8_channel,torch.float16,16384,96,5120,160,8,0,0,asm,13001+23001,5528.9355 +gfx936,int8_w8a8_channel,torch.float16,24576,96,5120,160,8,0,0,asm,13001+23001,8067.3382 +gfx936,int8_w8a8_channel,torch.float16,32768,96,5120,160,8,0,0,asm,13001+23001,10613.811 +gfx936,int8_w8a8_channel,torch.float16,1,192,4096,128,8,0,0,asm,10002+20000,45.8035 +gfx936,int8_w8a8_channel,torch.float16,2,192,4096,128,8,0,0,asm,10002+20000,65.0792 +gfx936,int8_w8a8_channel,torch.float16,4,192,4096,128,8,0,0,asm,10002+20000,93.0033 +gfx936,int8_w8a8_channel,torch.float16,6,192,4096,128,8,0,0,asm,10001+20000,123.6391 +gfx936,int8_w8a8_channel,torch.float16,8,192,4096,128,8,0,0,asm,10001+20000,151.7235 +gfx936,int8_w8a8_channel,torch.float16,12,192,4096,128,8,0,0,asm,10002+20000,190.6369 +gfx936,int8_w8a8_channel,torch.float16,16,192,4096,128,8,0,0,asm,10001+20001,224.3127 
+gfx936,int8_w8a8_channel,torch.float16,24,192,4096,128,8,0,0,asm,10002+20001,261.0761 +gfx936,int8_w8a8_channel,torch.float16,32,192,4096,128,8,0,0,asm,10001+20001,280.3211 +gfx936,int8_w8a8_channel,torch.float16,36,192,4096,128,8,0,0,asm,10002+20001,286.1538 +gfx936,int8_w8a8_channel,torch.float16,48,192,4096,128,8,0,0,asm,10001+20001,311.2739 +gfx936,int8_w8a8_channel,torch.float16,56,192,4096,128,8,0,0,asm,10001+20001,315.4338 +gfx936,int8_w8a8_channel,torch.float16,64,192,4096,128,8,0,0,asm,10001+20001,330.0894 +gfx936,int8_w8a8_channel,torch.float16,72,192,4096,128,8,0,0,asm,10001+20001,321.1854 +gfx936,int8_w8a8_channel,torch.float16,80,192,4096,128,8,0,0,asm,10001+20001,323.7706 +gfx936,int8_w8a8_channel,torch.float16,88,192,4096,128,8,0,0,asm,10002+20001,327.7454 +gfx936,int8_w8a8_channel,torch.float16,96,192,4096,128,8,0,0,asm,10002+20000,331.2233 +gfx936,int8_w8a8_channel,torch.float16,100,192,4096,128,8,0,0,asm,10002+20001,331.0465 +gfx936,int8_w8a8_channel,torch.float16,112,192,4096,128,8,0,0,asm,10002+20001,335.5349 +gfx936,int8_w8a8_channel,torch.float16,128,192,4096,128,8,0,0,asm,10002+20001,340.1243 +gfx936,int8_w8a8_channel,torch.float16,144,192,4096,128,8,0,0,asm,10002+20001,344.5454 +gfx936,int8_w8a8_channel,torch.float16,160,192,4096,128,8,0,0,asm,10002+20001,348.3854 +gfx936,int8_w8a8_channel,torch.float16,192,192,4096,128,8,0,0,asm,10002+20001,359.7033 +gfx936,int8_w8a8_channel,torch.float16,224,192,4096,128,8,0,0,asm,11006+21001,371.7958 +gfx936,int8_w8a8_channel,torch.float16,256,192,4096,128,8,0,0,asm,11005+21001,377.3874 +gfx936,int8_w8a8_channel,torch.float16,320,192,4096,128,8,0,0,asm,11004+21001,403.1727 +gfx936,int8_w8a8_channel,torch.float16,384,192,4096,128,8,0,0,asm,11004+21001,412.6969 +gfx936,int8_w8a8_channel,torch.float16,448,192,4096,128,8,0,0,asm,11005+21001,427.6864 +gfx936,int8_w8a8_channel,torch.float16,512,192,4096,128,8,0,0,asm,11005+21001,456.5368 
+gfx936,int8_w8a8_channel,torch.float16,640,192,4096,128,8,0,0,asm,12004+22001,481.2694 +gfx936,int8_w8a8_channel,torch.float16,768,192,4096,128,8,0,0,asm,12001+22001,508.0230 +gfx936,int8_w8a8_channel,torch.float16,896,192,4096,128,8,0,0,asm,12000+22001,552.1999 +gfx936,int8_w8a8_channel,torch.float16,1024,192,4096,128,8,0,0,asm,12001+22001,606.8441 +gfx936,int8_w8a8_channel,torch.float16,1280,192,4096,128,8,0,0,asm,13001+23001,697.2861 +gfx936,int8_w8a8_channel,torch.float16,1536,192,4096,128,8,0,0,asm,13001+23001,738.6249 +gfx936,int8_w8a8_channel,torch.float16,2048,192,4096,128,8,0,0,asm,13001+23001,959.1636 +gfx936,int8_w8a8_channel,torch.float16,2304,192,4096,128,8,0,0,asm,12000+22001,1102.2878 +gfx936,int8_w8a8_channel,torch.float16,2560,192,4096,128,8,0,0,asm,12000+22001,1152.1824 +gfx936,int8_w8a8_channel,torch.float16,3072,192,4096,128,8,0,0,asm,13001+23001,1294.9864 +gfx936,int8_w8a8_channel,torch.float16,3584,192,4096,128,8,0,0,asm,13001+23001,1403.7105 +gfx936,int8_w8a8_channel,torch.float16,4096,192,4096,128,8,0,0,asm,13001+23001,1613.2093 +gfx936,int8_w8a8_channel,torch.float16,5120,192,4096,128,8,0,0,asm,13001+23001,1955.6931 +gfx936,int8_w8a8_channel,torch.float16,6144,192,4096,128,8,0,0,asm,13001+23001,2323.5159 +gfx936,int8_w8a8_channel,torch.float16,7168,192,4096,128,8,0,0,asm,13001+23001,2635.3051 +gfx936,int8_w8a8_channel,torch.float16,8192,192,4096,128,8,0,0,asm,13001+23001,2953.2079 +gfx936,int8_w8a8_channel,torch.float16,10240,192,4096,128,8,0,0,asm,13001+23001,3634.3272 +gfx936,int8_w8a8_channel,torch.float16,12288,192,4096,128,8,0,0,asm,13001+23001,4299.6487 +gfx936,int8_w8a8_channel,torch.float16,16384,192,4096,128,8,0,0,asm,13001+23001,5648.0011 +gfx936,int8_w8a8_channel,torch.float16,24576,192,4096,128,8,0,0,asm,13001+23001,8335.2828 +gfx936,int8_w8a8_channel,torch.float16,32768,192,4096,128,8,0,0,asm,13001+23001,11020.2319 +gfx936,int8_w8a8_channel,torch.float16,1,96,4096,128,8,0,0,asm,10003+20001,38.1151 
+gfx936,int8_w8a8_channel,torch.float16,2,96,4096,128,8,0,0,asm,10002+20000,47.5549 +gfx936,int8_w8a8_channel,torch.float16,4,96,4096,128,8,0,0,asm,10008+20001,66.3425 +gfx936,int8_w8a8_channel,torch.float16,6,96,4096,128,8,0,0,asm,10011+20001,79.4709 +gfx936,int8_w8a8_channel,torch.float16,8,96,4096,128,8,0,0,asm,10011+20001,95.3781 +gfx936,int8_w8a8_channel,torch.float16,12,96,4096,128,8,0,0,asm,10001+20001,120.717 +gfx936,int8_w8a8_channel,torch.float16,16,96,4096,128,8,0,0,asm,10001+20001,137.7275 +gfx936,int8_w8a8_channel,torch.float16,24,96,4096,128,8,0,0,asm,10002+20001,154.5499 +gfx936,int8_w8a8_channel,torch.float16,32,96,4096,128,8,0,0,asm,10011+20001,169.9718 +gfx936,int8_w8a8_channel,torch.float16,36,96,4096,128,8,0,0,asm,10001+20001,170.0784 +gfx936,int8_w8a8_channel,torch.float16,48,96,4096,128,8,0,0,asm,10002+20001,183.2321 +gfx936,int8_w8a8_channel,torch.float16,56,96,4096,128,8,0,0,asm,10002+20001,185.6151 +gfx936,int8_w8a8_channel,torch.float16,64,96,4096,128,8,0,0,asm,10013+20001,200.6075 +gfx936,int8_w8a8_channel,torch.float16,72,96,4096,128,8,0,0,asm,10002+20001,188.1583 +gfx936,int8_w8a8_channel,torch.float16,80,96,4096,128,8,0,0,asm,10002+20001,189.2614 +gfx936,int8_w8a8_channel,torch.float16,88,96,4096,128,8,0,0,asm,10013+20001,192.1668 +gfx936,int8_w8a8_channel,torch.float16,96,96,4096,128,8,0,0,asm,10002+20001,193.8594 +gfx936,int8_w8a8_channel,torch.float16,100,96,4096,128,8,0,0,asm,10002+20001,195.5519 +gfx936,int8_w8a8_channel,torch.float16,112,96,4096,128,8,0,0,asm,10002+20001,196.9414 +gfx936,int8_w8a8_channel,torch.float16,128,96,4096,128,8,0,0,asm,10001+20001,198.7267 +gfx936,int8_w8a8_channel,torch.float16,144,96,4096,128,8,0,0,asm,10002+20001,204.8403 +gfx936,int8_w8a8_channel,torch.float16,160,96,4096,128,8,0,0,asm,10002+20001,208.2004 +gfx936,int8_w8a8_channel,torch.float16,192,96,4096,128,8,0,0,asm,10002+20001,218.5918 +gfx936,int8_w8a8_channel,torch.float16,224,96,4096,128,8,0,0,asm,11007+21001,226.6593 
+gfx936,int8_w8a8_channel,torch.float16,256,96,4096,128,8,0,0,asm,11007+21001,230.8277 +gfx936,int8_w8a8_channel,torch.float16,320,96,4096,128,8,0,0,asm,11000+21001,257.3539 +gfx936,int8_w8a8_channel,torch.float16,384,96,4096,128,8,0,0,asm,11004+21001,273.194 +gfx936,int8_w8a8_channel,torch.float16,448,96,4096,128,8,0,0,asm,11004+21001,283.4255 +gfx936,int8_w8a8_channel,torch.float16,512,96,4096,128,8,0,0,asm,11005+21001,300.8992 +gfx936,int8_w8a8_channel,torch.float16,640,96,4096,128,8,0,0,asm,12005+22001,330.1791 +gfx936,int8_w8a8_channel,torch.float16,768,96,4096,128,8,0,0,asm,12005+22001,348.8991 +gfx936,int8_w8a8_channel,torch.float16,896,96,4096,128,8,0,0,asm,12000+22001,395.1896 +gfx936,int8_w8a8_channel,torch.float16,1024,96,4096,128,8,0,0,asm,12001+22001,431.7875 +gfx936,int8_w8a8_channel,torch.float16,1280,96,4096,128,8,0,0,asm,13001+23001,521.4631 +gfx936,int8_w8a8_channel,torch.float16,1536,96,4096,128,8,0,0,asm,13001+23001,554.6757 +gfx936,int8_w8a8_channel,torch.float16,2048,96,4096,128,8,0,0,asm,12001+22001,704.6884 +gfx936,int8_w8a8_channel,torch.float16,2304,96,4096,128,8,0,0,asm,12001+22001,816.0986 +gfx936,int8_w8a8_channel,torch.float16,2560,96,4096,128,8,0,0,asm,12001+22001,853.9596 +gfx936,int8_w8a8_channel,torch.float16,3072,96,4096,128,8,0,0,asm,12005+22001,990.5068 +gfx936,int8_w8a8_channel,torch.float16,3584,96,4096,128,8,0,0,asm,13001+23001,1099.4415 +gfx936,int8_w8a8_channel,torch.float16,4096,96,4096,128,8,0,0,asm,13001+23001,1252.3592 +gfx936,int8_w8a8_channel,torch.float16,5120,96,4096,128,8,0,0,asm,13001+23001,1523.34 +gfx936,int8_w8a8_channel,torch.float16,6144,96,4096,128,8,0,0,asm,13001+23001,1819.7692 +gfx936,int8_w8a8_channel,torch.float16,7168,96,4096,128,8,0,0,asm,13001+23001,2057.79 +gfx936,int8_w8a8_channel,torch.float16,8192,96,4096,128,8,0,0,asm,13001+23001,2314.2613 +gfx936,int8_w8a8_channel,torch.float16,10240,96,4096,128,8,0,0,asm,13001+23001,2847.3472 
+gfx936,int8_w8a8_channel,torch.float16,12288,96,4096,128,8,0,0,asm,13001+23001,3375.0267 +gfx936,int8_w8a8_channel,torch.float16,16384,96,4096,128,8,0,0,asm,13001+23001,4440.3141 +gfx936,int8_w8a8_channel,torch.float16,24576,96,4096,128,8,0,0,asm,13001+23001,6558.0723 +gfx936,int8_w8a8_channel,torch.float16,32768,96,4096,128,8,0,0,asm,13001+23001,8674.584 +gfx936,int8_w8a8_channel,torch.float16,1,256,3072,256,8,0,0,asm,10002+20000,48.3972 +gfx936,int8_w8a8_channel,torch.float16,2,256,3072,256,8,0,0,asm,10002+20000,66.0139 +gfx936,int8_w8a8_channel,torch.float16,4,256,3072,256,8,0,0,asm,10013+20000,98.4014 +gfx936,int8_w8a8_channel,torch.float16,6,256,3072,256,8,0,0,asm,10002+20000,129.3739 +gfx936,int8_w8a8_channel,torch.float16,8,256,3072,256,8,0,0,asm,10013+20000,156.3802 +gfx936,int8_w8a8_channel,torch.float16,12,256,3072,256,8,0,0,asm,10002+20000,205.3233 +gfx936,int8_w8a8_channel,torch.float16,16,256,3072,256,8,0,0,asm,10002+20001,248.0685 +gfx936,int8_w8a8_channel,torch.float16,24,256,3072,256,8,0,0,asm,10002+20001,320.0738 +gfx936,int8_w8a8_channel,torch.float16,32,256,3072,256,8,0,0,asm,10002+20001,392.4726 +gfx936,int8_w8a8_channel,torch.float16,36,256,3072,256,8,0,0,asm,10002+20001,402.2885 +gfx936,int8_w8a8_channel,torch.float16,48,256,3072,256,8,0,0,asm,10002+20001,450.5242 +gfx936,int8_w8a8_channel,torch.float16,56,256,3072,256,8,0,0,asm,10002+20000,474.7179 +gfx936,int8_w8a8_channel,torch.float16,64,256,3072,256,8,0,0,asm,10002+20001,492.7474 +gfx936,int8_w8a8_channel,torch.float16,72,256,3072,256,8,0,0,asm,10002+20000,510.5494 +gfx936,int8_w8a8_channel,torch.float16,80,256,3072,256,8,0,0,asm,10002+20001,527.6442 +gfx936,int8_w8a8_channel,torch.float16,88,256,3072,256,8,0,0,asm,10002+20001,544.7388 +gfx936,int8_w8a8_channel,torch.float16,96,256,3072,256,8,0,0,asm,10002+20001,551.7368 +gfx936,int8_w8a8_channel,torch.float16,100,256,3072,256,8,0,0,asm,10002+20000,552.4694 
+gfx936,int8_w8a8_channel,torch.float16,112,256,3072,256,8,0,0,asm,10002+20001,563.6272 +gfx936,int8_w8a8_channel,torch.float16,128,256,3072,256,8,0,0,asm,10002+20000,578.1029 +gfx936,int8_w8a8_channel,torch.float16,144,256,3072,256,8,0,0,asm,10002+20001,586.8777 +gfx936,int8_w8a8_channel,torch.float16,160,256,3072,256,8,0,0,asm,10002+20001,592.4525 +gfx936,int8_w8a8_channel,torch.float16,192,256,3072,256,8,0,0,asm,10002+20001,604.444 +gfx936,int8_w8a8_channel,torch.float16,224,256,3072,256,8,0,0,asm,10002+20001,615.3409 +gfx936,int8_w8a8_channel,torch.float16,256,256,3072,256,8,0,0,asm,10002+20001,623.1893 +gfx936,int8_w8a8_channel,torch.float16,320,256,3072,256,8,0,0,asm,10002+20001,639.0377 +gfx936,int8_w8a8_channel,torch.float16,384,256,3072,256,8,0,0,asm,11007+21001,657.9093 +gfx936,int8_w8a8_channel,torch.float16,448,256,3072,256,8,0,0,asm,11007+21001,668.823 +gfx936,int8_w8a8_channel,torch.float16,512,256,3072,256,8,0,0,asm,11007+21001,686.1451 +gfx936,int8_w8a8_channel,torch.float16,640,256,3072,256,8,0,0,asm,11007+21001,719.2651 +gfx936,int8_w8a8_channel,torch.float16,768,256,3072,256,8,0,0,asm,11007+21001,742.8187 +gfx936,int8_w8a8_channel,torch.float16,896,256,3072,256,8,0,0,asm,11007+21001,765.7492 +gfx936,int8_w8a8_channel,torch.float16,1024,256,3072,256,8,0,0,asm,11005+21001,805.9598 +gfx936,int8_w8a8_channel,torch.float16,1280,256,3072,256,8,0,0,asm,12005+22001,861.6987 +gfx936,int8_w8a8_channel,torch.float16,1536,256,3072,256,8,0,0,asm,12005+22001,920.4859 +gfx936,int8_w8a8_channel,torch.float16,2048,256,3072,256,8,0,0,asm,12005+22001,1047.6522 +gfx936,int8_w8a8_channel,torch.float16,2304,256,3072,256,8,0,0,asm,13001+23001,1113.9763 +gfx936,int8_w8a8_channel,torch.float16,2560,256,3072,256,8,0,0,asm,13001+23001,1148.7468 +gfx936,int8_w8a8_channel,torch.float16,3072,256,3072,256,8,0,0,asm,13001+23001,1226.7341 +gfx936,int8_w8a8_channel,torch.float16,3584,256,3072,256,8,0,0,asm,13001+23001,1336.0982 
+gfx936,int8_w8a8_channel,torch.float16,4096,256,3072,256,8,0,0,asm,13001+23001,1522.7002 +gfx936,int8_w8a8_channel,torch.float16,5120,256,3072,256,8,0,0,asm,12001+22001,1872.0472 +gfx936,int8_w8a8_channel,torch.float16,6144,256,3072,256,8,0,0,asm,13001+23001,2030.5648 +gfx936,int8_w8a8_channel,torch.float16,7168,256,3072,256,8,0,0,asm,13001+23001,2183.6848 +gfx936,int8_w8a8_channel,torch.float16,8192,256,3072,256,8,0,0,asm,13001+23001,2521.495 +gfx936,int8_w8a8_channel,torch.float16,10240,256,3072,256,8,0,0,asm,13001+23001,3044.9724 +gfx936,int8_w8a8_channel,torch.float16,12288,256,3072,256,8,0,0,asm,13001+23001,3573.7635 +gfx936,int8_w8a8_channel,torch.float16,16384,256,3072,256,8,0,0,asm,13001+23001,4611.2867 +gfx936,int8_w8a8_channel,torch.float16,24576,256,3072,256,8,0,0,asm,13001+23001,6740.9099 +gfx936,int8_w8a8_channel,torch.float16,32768,256,3072,256,8,0,0,asm,13001+23001,8835.3837 +gfx936,int8_w8a8_channel,torch.float16,1,128,3072,256,8,0,0,asm,10004+20000,44.4477 +gfx936,int8_w8a8_channel,torch.float16,2,128,3072,256,8,0,0,asm,10002+20100,52.7761 +gfx936,int8_w8a8_channel,torch.float16,4,128,3072,256,8,0,0,asm,10002+20102,69.7361 +gfx936,int8_w8a8_channel,torch.float16,6,128,3072,256,8,0,0,asm,10002+20102,87.2939 +gfx936,int8_w8a8_channel,torch.float16,8,128,3072,256,8,0,0,asm,10010+20102,101.1886 +gfx936,int8_w8a8_channel,torch.float16,12,128,3072,256,8,0,0,asm,10002+20001,131.2096 +gfx936,int8_w8a8_channel,torch.float16,16,128,3072,256,8,0,0,asm,10002+20001,153.9296 +gfx936,int8_w8a8_channel,torch.float16,24,128,3072,256,8,0,0,asm,10002+20001,187.5521 +gfx936,int8_w8a8_channel,torch.float16,32,128,3072,256,8,0,0,asm,10002+20001,232.5991 +gfx936,int8_w8a8_channel,torch.float16,36,128,3072,256,8,0,0,asm,10002+20001,227.4087 +gfx936,int8_w8a8_channel,torch.float16,48,128,3072,256,8,0,0,asm,10002+20001,253.9349 +gfx936,int8_w8a8_channel,torch.float16,56,128,3072,256,8,0,0,asm,10002+20001,266.7097 
+gfx936,int8_w8a8_channel,torch.float16,64,128,3072,256,8,0,0,asm,10002+20001,279.2823 +gfx936,int8_w8a8_channel,torch.float16,72,128,3072,256,8,0,0,asm,10002+20001,286.6423 +gfx936,int8_w8a8_channel,torch.float16,80,128,3072,256,8,0,0,asm,10002+20001,296.7054 +gfx936,int8_w8a8_channel,torch.float16,88,128,3072,256,8,0,0,asm,10002+20001,303.0211 +gfx936,int8_w8a8_channel,torch.float16,96,128,3072,256,8,0,0,asm,10002+20001,310.4316 +gfx936,int8_w8a8_channel,torch.float16,100,128,3072,256,8,0,0,asm,10002+20001,308.5117 +gfx936,int8_w8a8_channel,torch.float16,112,128,3072,256,8,0,0,asm,10002+20001,314.5159 +gfx936,int8_w8a8_channel,torch.float16,128,128,3072,256,8,0,0,asm,10002+20001,323.198 +gfx936,int8_w8a8_channel,torch.float16,144,128,3072,256,8,0,0,asm,10002+20001,326.8106 +gfx936,int8_w8a8_channel,torch.float16,160,128,3072,256,8,0,0,asm,10002+20001,332.4781 +gfx936,int8_w8a8_channel,torch.float16,192,128,3072,256,8,0,0,asm,10002+20001,333.9349 +gfx936,int8_w8a8_channel,torch.float16,224,128,3072,256,8,0,0,asm,10013+20001,342.6423 +gfx936,int8_w8a8_channel,torch.float16,256,128,3072,256,8,0,0,asm,10002+20001,344.8234 +gfx936,int8_w8a8_channel,torch.float16,320,128,3072,256,8,0,0,asm,10002+20001,355.7706 +gfx936,int8_w8a8_channel,torch.float16,384,128,3072,256,8,0,0,asm,10002+20001,372.5369 +gfx936,int8_w8a8_channel,torch.float16,448,128,3072,256,8,0,0,asm,11007+21001,379.2065 +gfx936,int8_w8a8_channel,torch.float16,512,128,3072,256,8,0,0,asm,11007+21001,395.7538 +gfx936,int8_w8a8_channel,torch.float16,640,128,3072,256,8,0,0,asm,11004+21001,416.5117 +gfx936,int8_w8a8_channel,torch.float16,768,128,3072,256,8,0,0,asm,11007+21001,439.7537 +gfx936,int8_w8a8_channel,torch.float16,896,128,3072,256,8,0,0,asm,11007+21001,443.0296 +gfx936,int8_w8a8_channel,torch.float16,1024,128,3072,256,8,0,0,asm,11005+21001,475.6358 +gfx936,int8_w8a8_channel,torch.float16,1280,128,3072,256,8,0,0,asm,12005+22001,515.2147 
+gfx936,int8_w8a8_channel,torch.float16,1536,128,3072,256,8,0,0,asm,12005+22001,567.0547 +gfx936,int8_w8a8_channel,torch.float16,2048,128,3072,256,8,0,0,asm,12005+22001,659.6858 +gfx936,int8_w8a8_channel,torch.float16,2304,128,3072,256,8,0,0,asm,12001+21102,742.3382 +gfx936,int8_w8a8_channel,torch.float16,2560,128,3072,256,8,0,0,asm,12001+21102,780.6539 +gfx936,int8_w8a8_channel,torch.float16,3072,128,3072,256,8,0,0,asm,13001+23001,824.0141 +gfx936,int8_w8a8_channel,torch.float16,3584,128,3072,256,8,0,0,asm,13001+23001,908.1319 +gfx936,int8_w8a8_channel,torch.float16,4096,128,3072,256,8,0,0,asm,12001+22001,1038.8349 +gfx936,int8_w8a8_channel,torch.float16,5120,128,3072,256,8,0,0,asm,12001+22001,1267.7605 +gfx936,int8_w8a8_channel,torch.float16,6144,128,3072,256,8,0,0,asm,13001+23001,1429.2759 +gfx936,int8_w8a8_channel,torch.float16,7168,128,3072,256,8,0,0,asm,13001+23001,1555.8024 +gfx936,int8_w8a8_channel,torch.float16,8192,128,3072,256,8,0,0,asm,13001+23001,1789.4696 +gfx936,int8_w8a8_channel,torch.float16,10240,128,3072,256,8,0,0,asm,13001+23001,2165.9155 +gfx936,int8_w8a8_channel,torch.float16,12288,128,3072,256,8,0,0,asm,13001+23001,2530.2763 +gfx936,int8_w8a8_channel,torch.float16,16384,128,3072,256,8,0,0,asm,13001+23001,3264.3642 +gfx936,int8_w8a8_channel,torch.float16,24576,128,3072,256,8,0,0,asm,13001+23001,4759.7737 +gfx936,int8_w8a8_channel,torch.float16,32768,128,3072,256,8,0,0,asm,13001+23001,6279.1322 diff --git a/aiter/configs/tuned_fmoe_asm_w8a8_channel_shuffle.csv b/aiter/configs/tuned_fmoe_asm_w8a8_channel_shuffle.csv new file mode 100644 index 0000000000000000000000000000000000000000..805ed7dcb9538b32d84e275b5555c5fff6a5fa87 --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm_w8a8_channel_shuffle.csv @@ -0,0 +1,1044 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx936,int8_w8a8_channel,torch.float16,1,128,7168,256,8,0,0,asm,10003+20100,60.7153 
+gfx936,int8_w8a8_channel,torch.float16,2,128,7168,256,8,0,0,asm,10002+20102,77.9869 +gfx936,int8_w8a8_channel,torch.float16,3,128,7168,256,8,0,0,asm,10009+20102,92.7455 +gfx936,int8_w8a8_channel,torch.float16,4,128,7168,256,8,0,0,asm,10010+20102,110.4505 +gfx936,int8_w8a8_channel,torch.float16,5,128,7168,256,8,0,0,asm,10006+20102,133.0372 +gfx936,int8_w8a8_channel,torch.float16,6,128,7168,256,8,0,0,asm,10005+20102,155.6895 +gfx936,int8_w8a8_channel,torch.float16,7,128,7168,256,8,0,0,asm,10001+20102,165.0761 +gfx936,int8_w8a8_channel,torch.float16,8,128,7168,256,8,0,0,asm,10005+20102,175.591 +gfx936,int8_w8a8_channel,torch.float16,9,128,7168,256,8,0,0,asm,10001+20102,188.4515 +gfx936,int8_w8a8_channel,torch.float16,10,128,7168,256,8,0,0,asm,10001+20102,202.0567 +gfx936,int8_w8a8_channel,torch.float16,11,128,7168,256,8,0,0,asm,10001+20102,216.0628 +gfx936,int8_w8a8_channel,torch.float16,12,128,7168,256,8,0,0,asm,10001+20102,227.7442 +gfx936,int8_w8a8_channel,torch.float16,13,128,7168,256,8,0,0,asm,10001+20102,241.5275 +gfx936,int8_w8a8_channel,torch.float16,14,128,7168,256,8,0,0,asm,10004+20102,254.7253 +gfx936,int8_w8a8_channel,torch.float16,15,128,7168,256,8,0,0,asm,10005+20102,261.3183 +gfx936,int8_w8a8_channel,torch.float16,16,128,7168,256,8,0,0,asm,10001+20102,266.8521 +gfx936,int8_w8a8_channel,torch.float16,17,128,7168,256,8,0,0,asm,10001+20102,268.9418 +gfx936,int8_w8a8_channel,torch.float16,18,128,7168,256,8,0,0,asm,10001+20102,279.5306 +gfx936,int8_w8a8_channel,torch.float16,20,128,7168,256,8,0,0,asm,10001+20102,304.5974 +gfx936,int8_w8a8_channel,torch.float16,24,128,7168,256,8,0,0,asm,10001+20102,347.4364 +gfx936,int8_w8a8_channel,torch.float16,28,128,7168,256,8,0,0,asm,10001+20102,394.1512 +gfx936,int8_w8a8_channel,torch.float16,32,128,7168,256,8,0,0,asm,10001+20001,420.4774 +gfx936,int8_w8a8_channel,torch.float16,34,128,7168,256,8,0,0,asm,10004+20001,423.2036 +gfx936,int8_w8a8_channel,torch.float16,36,128,7168,256,8,0,0,asm,10001+20001,436.3232 
+gfx936,int8_w8a8_channel,torch.float16,40,128,7168,256,8,0,0,asm,10001+20102,452.7313 +gfx936,int8_w8a8_channel,torch.float16,44,128,7168,256,8,0,0,asm,10000+20001,472.2898 +gfx936,int8_w8a8_channel,torch.float16,48,128,7168,256,8,0,0,asm,10000+20001,484.0682 +gfx936,int8_w8a8_channel,torch.float16,56,128,7168,256,8,0,0,asm,10001+20001,510.1813 +gfx936,int8_w8a8_channel,torch.float16,64,128,7168,256,8,0,0,asm,10001+20001,530.0211 +gfx936,int8_w8a8_channel,torch.float16,68,128,7168,256,8,0,0,asm,10000+20001,534.858 +gfx936,int8_w8a8_channel,torch.float16,72,128,7168,256,8,0,0,asm,10000+20001,547.338 +gfx936,int8_w8a8_channel,torch.float16,80,128,7168,256,8,0,0,asm,10000+20001,563.2076 +gfx936,int8_w8a8_channel,torch.float16,88,128,7168,256,8,0,0,asm,10000+20001,582.2851 +gfx936,int8_w8a8_channel,torch.float16,96,128,7168,256,8,0,0,asm,10000+20001,587.5819 +gfx936,int8_w8a8_channel,torch.float16,104,128,7168,256,8,0,0,asm,10000+20001,593.8242 +gfx936,int8_w8a8_channel,torch.float16,112,128,7168,256,8,0,0,asm,10001+20001,603.3377 +gfx936,int8_w8a8_channel,torch.float16,128,128,7168,256,8,0,0,asm,10000+20001,617.634 +gfx936,int8_w8a8_channel,torch.float16,144,128,7168,256,8,0,0,asm,10000+20001,630.7514 +gfx936,int8_w8a8_channel,torch.float16,160,128,7168,256,8,0,0,asm,10000+20001,639.4196 +gfx936,int8_w8a8_channel,torch.float16,192,128,7168,256,8,0,0,asm,10000+20001,654.4355 +gfx936,int8_w8a8_channel,torch.float16,224,128,7168,256,8,0,0,asm,10000+20001,668.7697 +gfx936,int8_w8a8_channel,torch.float16,256,128,7168,256,8,0,0,asm,10000+20001,679.4586 +gfx936,int8_w8a8_channel,torch.float16,320,128,7168,256,8,0,0,asm,10001+20001,712.4597 +gfx936,int8_w8a8_channel,torch.float16,384,128,7168,256,8,0,0,asm,10002+20001,737.9884 +gfx936,int8_w8a8_channel,torch.float16,448,128,7168,256,8,0,0,asm,10002+20102,773.0358 +gfx936,int8_w8a8_channel,torch.float16,512,128,7168,256,8,0,0,asm,10006+20102,791.3841 
+gfx936,int8_w8a8_channel,torch.float16,576,128,7168,256,8,0,0,asm,11000+21001,818.8247 +gfx936,int8_w8a8_channel,torch.float16,640,128,7168,256,8,0,0,asm,11000+21001,836.441 +gfx936,int8_w8a8_channel,torch.float16,704,128,7168,256,8,0,0,asm,11000+21001,862.0874 +gfx936,int8_w8a8_channel,torch.float16,768,128,7168,256,8,0,0,asm,11006+21001,885.7409 +gfx936,int8_w8a8_channel,torch.float16,832,128,7168,256,8,0,0,asm,11006+21001,914.526 +gfx936,int8_w8a8_channel,torch.float16,896,128,7168,256,8,0,0,asm,11006+21001,933.0011 +gfx936,int8_w8a8_channel,torch.float16,960,128,7168,256,8,0,0,asm,11006+21001,952.168 +gfx936,int8_w8a8_channel,torch.float16,1024,128,7168,256,8,0,0,asm,11006+21001,976.3539 +gfx936,int8_w8a8_channel,torch.float16,1152,128,7168,256,8,0,0,asm,11007+21001,1038.3866 +gfx936,int8_w8a8_channel,torch.float16,1280,128,7168,256,8,0,0,asm,12004+22001,1072.8371 +gfx936,int8_w8a8_channel,torch.float16,1408,128,7168,256,8,0,0,asm,12004+22001,1114.2061 +gfx936,int8_w8a8_channel,torch.float16,1536,128,7168,256,8,0,0,asm,12004+22001,1147.3019 +gfx936,int8_w8a8_channel,torch.float16,1664,128,7168,256,8,0,0,asm,12004+22001,1188.4794 +gfx936,int8_w8a8_channel,torch.float16,1792,128,7168,256,8,0,0,asm,11007+21001,1272.1697 +gfx936,int8_w8a8_channel,torch.float16,1920,128,7168,256,8,0,0,asm,12004+22001,1314.1846 +gfx936,int8_w8a8_channel,torch.float16,2048,128,7168,256,8,0,0,asm,12004+22001,1393.276 +gfx936,int8_w8a8_channel,torch.float16,2304,128,7168,256,8,0,0,asm,12005+22001,1509.9256 +gfx936,int8_w8a8_channel,torch.float16,2560,128,7168,256,8,0,0,asm,12005+22001,1624.4163 +gfx936,int8_w8a8_channel,torch.float16,2816,128,7168,256,8,0,0,asm,12005+22001,1691.4089 +gfx936,int8_w8a8_channel,torch.float16,3072,128,7168,256,8,0,0,asm,13000+23001,1756.3363 +gfx936,int8_w8a8_channel,torch.float16,3328,128,7168,256,8,0,0,asm,13000+23001,1796.1869 +gfx936,int8_w8a8_channel,torch.float16,3584,128,7168,256,8,0,0,asm,12005+22001,1918.564 
+gfx936,int8_w8a8_channel,torch.float16,3840,128,7168,256,8,0,0,asm,12005+22001,2043.4429 +gfx936,int8_w8a8_channel,torch.float16,4096,128,7168,256,8,0,0,asm,12005+22001,2154.6243 +gfx936,int8_w8a8_channel,torch.float16,4608,128,7168,256,8,0,0,asm,12005+22001,2425.8021 +gfx936,int8_w8a8_channel,torch.float16,5120,128,7168,256,8,0,0,asm,12001+22001,2591.2525 +gfx936,int8_w8a8_channel,torch.float16,5632,128,7168,256,8,0,0,asm,12005+22001,2774.3757 +gfx936,int8_w8a8_channel,torch.float16,6144,128,7168,256,8,0,0,asm,13001+23001,3007.5207 +gfx936,int8_w8a8_channel,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23001,3109.7569 +gfx936,int8_w8a8_channel,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23001,3259.6305 +gfx936,int8_w8a8_channel,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23001,3473.4205 +gfx936,int8_w8a8_channel,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23001,3792.1285 +gfx936,int8_w8a8_channel,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23001,4587.977 +gfx936,int8_w8a8_channel,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23001,5386.7393 +gfx936,int8_w8a8_channel,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23001,6132.057 +gfx936,int8_w8a8_channel,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23001,6980.8374 +gfx936,int8_w8a8_channel,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23001,7425.3098 +gfx936,int8_w8a8_channel,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23001,10095.1451 +gfx936,int8_w8a8_channel,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23001,13283.6878 +gfx936,int8_w8a8_channel,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15384.0472 +gfx936,int8_w8a8_channel,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23101,19007.0848 +gfx936,int8_w8a8_channel,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23001,22780.6062 +gfx936,int8_w8a8_channel,torch.float16,1,256,7168,256,8,0,0,asm,10002+20001,75.0987 +gfx936,int8_w8a8_channel,torch.float16,2,256,7168,256,8,0,0,asm,10010+20001,112.2721 
+gfx936,int8_w8a8_channel,torch.float16,3,256,7168,256,8,0,0,asm,10007+20001,152.3008 +gfx936,int8_w8a8_channel,torch.float16,4,256,7168,256,8,0,0,asm,10004+20001,192.0999 +gfx936,int8_w8a8_channel,torch.float16,5,256,7168,256,8,0,0,asm,10001+20001,217.7121 +gfx936,int8_w8a8_channel,torch.float16,6,256,7168,256,8,0,0,asm,10004+20001,245.6376 +gfx936,int8_w8a8_channel,torch.float16,7,256,7168,256,8,0,0,asm,10001+20001,273.5973 +gfx936,int8_w8a8_channel,torch.float16,8,256,7168,256,8,0,0,asm,10001+20001,294.147 +gfx936,int8_w8a8_channel,torch.float16,9,256,7168,256,8,0,0,asm,10005+20001,320.9195 +gfx936,int8_w8a8_channel,torch.float16,10,256,7168,256,8,0,0,asm,10001+20001,344.6414 +gfx936,int8_w8a8_channel,torch.float16,11,256,7168,256,8,0,0,asm,10001+20001,367.9973 +gfx936,int8_w8a8_channel,torch.float16,12,256,7168,256,8,0,0,asm,10001+20001,384.5183 +gfx936,int8_w8a8_channel,torch.float16,13,256,7168,256,8,0,0,asm,10001+20001,404.5027 +gfx936,int8_w8a8_channel,torch.float16,14,256,7168,256,8,0,0,asm,10001+20001,432.7809 +gfx936,int8_w8a8_channel,torch.float16,15,256,7168,256,8,0,0,asm,10001+20001,449.9427 +gfx936,int8_w8a8_channel,torch.float16,16,256,7168,256,8,0,0,asm,10001+20001,461.8845 +gfx936,int8_w8a8_channel,torch.float16,17,256,7168,256,8,0,0,asm,10001+20001,474.664 +gfx936,int8_w8a8_channel,torch.float16,18,256,7168,256,8,0,0,asm,10001+20001,494.5292 +gfx936,int8_w8a8_channel,torch.float16,20,256,7168,256,8,0,0,asm,10001+20001,535.7401 +gfx936,int8_w8a8_channel,torch.float16,24,256,7168,256,8,0,0,asm,10001+20001,613.1227 +gfx936,int8_w8a8_channel,torch.float16,28,256,7168,256,8,0,0,asm,10001+20001,697.9555 +gfx936,int8_w8a8_channel,torch.float16,32,256,7168,256,8,0,0,asm,10001+20001,745.6344 +gfx936,int8_w8a8_channel,torch.float16,34,256,7168,256,8,0,0,asm,10000+20001,756.038 +gfx936,int8_w8a8_channel,torch.float16,36,256,7168,256,8,0,0,asm,10001+20001,784.6725 +gfx936,int8_w8a8_channel,torch.float16,40,256,7168,256,8,0,0,asm,10000+20001,815.961 
+gfx936,int8_w8a8_channel,torch.float16,44,256,7168,256,8,0,0,asm,10000+20001,853.4527 +gfx936,int8_w8a8_channel,torch.float16,48,256,7168,256,8,0,0,asm,10000+20001,874.2259 +gfx936,int8_w8a8_channel,torch.float16,56,256,7168,256,8,0,0,asm,10000+20001,920.5861 +gfx936,int8_w8a8_channel,torch.float16,64,256,7168,256,8,0,0,asm,10000+20001,957.2705 +gfx936,int8_w8a8_channel,torch.float16,68,256,7168,256,8,0,0,asm,10000+20001,966.4684 +gfx936,int8_w8a8_channel,torch.float16,72,256,7168,256,8,0,0,asm,10000+20001,993.4945 +gfx936,int8_w8a8_channel,torch.float16,80,256,7168,256,8,0,0,asm,10000+20001,1024.3243 +gfx936,int8_w8a8_channel,torch.float16,88,256,7168,256,8,0,0,asm,10000+20001,1060.0685 +gfx936,int8_w8a8_channel,torch.float16,96,256,7168,256,8,0,0,asm,10000+20001,1075.8352 +gfx936,int8_w8a8_channel,torch.float16,104,256,7168,256,8,0,0,asm,10000+20001,1088.9588 +gfx936,int8_w8a8_channel,torch.float16,112,256,7168,256,8,0,0,asm,10000+20001,1100.0135 +gfx936,int8_w8a8_channel,torch.float16,128,256,7168,256,8,0,0,asm,10000+20001,1124.4242 +gfx936,int8_w8a8_channel,torch.float16,144,256,7168,256,8,0,0,asm,12004+22001,1140.4953 +gfx936,int8_w8a8_channel,torch.float16,160,256,7168,256,8,0,0,asm,12004+22001,1153.8942 +gfx936,int8_w8a8_channel,torch.float16,192,256,7168,256,8,0,0,asm,12004+22001,1170.6655 +gfx936,int8_w8a8_channel,torch.float16,224,256,7168,256,8,0,0,asm,12004+22001,1176.2023 +gfx936,int8_w8a8_channel,torch.float16,256,256,7168,256,8,0,0,asm,12004+22001,1188.1461 +gfx936,int8_w8a8_channel,torch.float16,320,256,7168,256,8,0,0,asm,12004+22001,1215.3141 +gfx936,int8_w8a8_channel,torch.float16,384,256,7168,256,8,0,0,asm,12004+22001,1230.5589 +gfx936,int8_w8a8_channel,torch.float16,448,256,7168,256,8,0,0,asm,12004+22001,1253.6801 +gfx936,int8_w8a8_channel,torch.float16,512,256,7168,256,8,0,0,asm,12004+22001,1274.6525 +gfx936,int8_w8a8_channel,torch.float16,576,256,7168,256,8,0,0,asm,12004+22001,1303.6342 
+gfx936,int8_w8a8_channel,torch.float16,640,256,7168,256,8,0,0,asm,12004+22001,1324.7529 +gfx936,int8_w8a8_channel,torch.float16,704,256,7168,256,8,0,0,asm,12004+22001,1346.4485 +gfx936,int8_w8a8_channel,torch.float16,768,256,7168,256,8,0,0,asm,12004+22001,1371.3286 +gfx936,int8_w8a8_channel,torch.float16,832,256,7168,256,8,0,0,asm,12004+22001,1381.5703 +gfx936,int8_w8a8_channel,torch.float16,896,256,7168,256,8,0,0,asm,12004+22001,1413.5563 +gfx936,int8_w8a8_channel,torch.float16,960,256,7168,256,8,0,0,asm,12004+22001,1426.0749 +gfx936,int8_w8a8_channel,torch.float16,1024,256,7168,256,8,0,0,asm,12004+22001,1459.5109 +gfx936,int8_w8a8_channel,torch.float16,1152,256,7168,256,8,0,0,asm,12004+22001,1489.2666 +gfx936,int8_w8a8_channel,torch.float16,1280,256,7168,256,8,0,0,asm,12004+22001,1551.0266 +gfx936,int8_w8a8_channel,torch.float16,1408,256,7168,256,8,0,0,asm,12004+22001,1581.6204 +gfx936,int8_w8a8_channel,torch.float16,1536,256,7168,256,8,0,0,asm,12004+22001,1624.9426 +gfx936,int8_w8a8_channel,torch.float16,1664,256,7168,256,8,0,0,asm,12004+22001,1673.3162 +gfx936,int8_w8a8_channel,torch.float16,1792,256,7168,256,8,0,0,asm,12004+22001,1750.1469 +gfx936,int8_w8a8_channel,torch.float16,1920,256,7168,256,8,0,0,asm,12004+22001,1815.5911 +gfx936,int8_w8a8_channel,torch.float16,2048,256,7168,256,8,0,0,asm,12004+22001,1913.0902 +gfx936,int8_w8a8_channel,torch.float16,2304,256,7168,256,8,0,0,asm,12005+22001,2068.4439 +gfx936,int8_w8a8_channel,torch.float16,2560,256,7168,256,8,0,0,asm,12005+22001,2179.9266 +gfx936,int8_w8a8_channel,torch.float16,2816,256,7168,256,8,0,0,asm,12005+22001,2261.0226 +gfx936,int8_w8a8_channel,torch.float16,3072,256,7168,256,8,0,0,asm,12005+22001,2341.2846 +gfx936,int8_w8a8_channel,torch.float16,3328,256,7168,256,8,0,0,asm,12005+22001,2433.3414 +gfx936,int8_w8a8_channel,torch.float16,3584,256,7168,256,8,0,0,asm,12005+22001,2551.62 +gfx936,int8_w8a8_channel,torch.float16,3840,256,7168,256,8,0,0,asm,12005+22001,2685.2969 
+gfx936,int8_w8a8_channel,torch.float16,4096,256,7168,256,8,0,0,asm,12005+22001,2888.1218 +gfx936,int8_w8a8_channel,torch.float16,4608,256,7168,256,8,0,0,asm,12005+22001,3288.5242 +gfx936,int8_w8a8_channel,torch.float16,5120,256,7168,256,8,0,0,asm,12001+22001,3496.5502 +gfx936,int8_w8a8_channel,torch.float16,5632,256,7168,256,8,0,0,asm,12005+22001,3709.4328 +gfx936,int8_w8a8_channel,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23001,3861.1458 +gfx936,int8_w8a8_channel,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23001,3971.802 +gfx936,int8_w8a8_channel,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23001,4122.667 +gfx936,int8_w8a8_channel,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23001,4365.1053 +gfx936,int8_w8a8_channel,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23001,4820.6556 +gfx936,int8_w8a8_channel,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23001,5935.9992 +gfx936,int8_w8a8_channel,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23001,6847.5073 +gfx936,int8_w8a8_channel,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23001,7958.7381 +gfx936,int8_w8a8_channel,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23001,8817.8405 +gfx936,int8_w8a8_channel,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23001,9465.1795 +gfx936,int8_w8a8_channel,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23001,12940.1163 +gfx936,int8_w8a8_channel,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23001,17049.7543 +gfx936,int8_w8a8_channel,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23001,21102.088 +gfx936,int8_w8a8_channel,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23001,25129.0387 +gfx938,int8_w8a8_channel,torch.float16,1,128,7168,256,8,0,0,asm,10000+20101,61.5548 +gfx938,int8_w8a8_channel,torch.float16,2,128,7168,256,8,0,0,asm,10006+20102,76.2512 +gfx938,int8_w8a8_channel,torch.float16,3,128,7168,256,8,0,0,asm,10008+20102,83.7998 +gfx938,int8_w8a8_channel,torch.float16,4,128,7168,256,8,0,0,asm,10008+20102,102.0971 
+gfx938,int8_w8a8_channel,torch.float16,5,128,7168,256,8,0,0,asm,10011+20102,125.0755 +gfx938,int8_w8a8_channel,torch.float16,6,128,7168,256,8,0,0,asm,10011+20102,130.7657 +gfx938,int8_w8a8_channel,torch.float16,7,128,7168,256,8,0,0,asm,10011+20102,138.7698 +gfx938,int8_w8a8_channel,torch.float16,8,128,7168,256,8,0,0,asm,10011+20102,143.9185 +gfx938,int8_w8a8_channel,torch.float16,9,128,7168,256,8,0,0,asm,10008+20102,156.7554 +gfx938,int8_w8a8_channel,torch.float16,10,128,7168,256,8,0,0,asm,10008+20102,169.7235 +gfx938,int8_w8a8_channel,torch.float16,11,128,7168,256,8,0,0,asm,10007+20102,196.9235 +gfx938,int8_w8a8_channel,torch.float16,12,128,7168,256,8,0,0,asm,10006+20102,197.9408 +gfx938,int8_w8a8_channel,torch.float16,13,128,7168,256,8,0,0,asm,10006+20102,202.0065 +gfx938,int8_w8a8_channel,torch.float16,14,128,7168,256,8,0,0,asm,10008+20102,207.7829 +gfx938,int8_w8a8_channel,torch.float16,15,128,7168,256,8,0,0,asm,10008+20102,211.6187 +gfx938,int8_w8a8_channel,torch.float16,16,128,7168,256,8,0,0,asm,10008+20102,215.549 +gfx938,int8_w8a8_channel,torch.float16,17,128,7168,256,8,0,0,asm,10008+20102,216.2229 +gfx938,int8_w8a8_channel,torch.float16,18,128,7168,256,8,0,0,asm,10008+20102,225.1172 +gfx938,int8_w8a8_channel,torch.float16,20,128,7168,256,8,0,0,asm,10007+20102,252.6248 +gfx938,int8_w8a8_channel,torch.float16,24,128,7168,256,8,0,0,asm,10009+20102,270.0401 +gfx938,int8_w8a8_channel,torch.float16,28,128,7168,256,8,0,0,asm,10006+20102,322.4983 +gfx938,int8_w8a8_channel,torch.float16,32,128,7168,256,8,0,0,asm,10009+20102,333.6121 +gfx938,int8_w8a8_channel,torch.float16,34,128,7168,256,8,0,0,asm,10008+20102,326.2245 +gfx938,int8_w8a8_channel,torch.float16,36,128,7168,256,8,0,0,asm,10008+20102,336.0665 +gfx938,int8_w8a8_channel,torch.float16,40,128,7168,256,8,0,0,asm,10006+20102,360.2305 +gfx938,int8_w8a8_channel,torch.float16,44,128,7168,256,8,0,0,asm,10006+20102,370.0438 +gfx938,int8_w8a8_channel,torch.float16,48,128,7168,256,8,0,0,asm,10008+20102,376.5956 
+gfx938,int8_w8a8_channel,torch.float16,56,128,7168,256,8,0,0,asm,10009+20102,390.4745 +gfx938,int8_w8a8_channel,torch.float16,64,128,7168,256,8,0,0,asm,10006+20102,412.9934 +gfx938,int8_w8a8_channel,torch.float16,68,128,7168,256,8,0,0,asm,10007+20102,438.704 +gfx938,int8_w8a8_channel,torch.float16,72,128,7168,256,8,0,0,asm,10007+20102,444.9891 +gfx938,int8_w8a8_channel,torch.float16,80,128,7168,256,8,0,0,asm,10006+20001,460.9659 +gfx938,int8_w8a8_channel,torch.float16,88,128,7168,256,8,0,0,asm,10008+20102,447.6596 +gfx938,int8_w8a8_channel,torch.float16,96,128,7168,256,8,0,0,asm,10008+20102,449.785 +gfx938,int8_w8a8_channel,torch.float16,104,128,7168,256,8,0,0,asm,10008+20102,456.8659 +gfx938,int8_w8a8_channel,torch.float16,112,128,7168,256,8,0,0,asm,10008+20102,459.0649 +gfx938,int8_w8a8_channel,torch.float16,128,128,7168,256,8,0,0,asm,10009+20102,469.3418 +gfx938,int8_w8a8_channel,torch.float16,144,128,7168,256,8,0,0,asm,10006+20102,487.4953 +gfx938,int8_w8a8_channel,torch.float16,160,128,7168,256,8,0,0,asm,10007+20102,494.2935 +gfx938,int8_w8a8_channel,torch.float16,192,128,7168,256,8,0,0,asm,10007+20102,503.7376 +gfx938,int8_w8a8_channel,torch.float16,224,128,7168,256,8,0,0,asm,10006+20102,507.8321 +gfx938,int8_w8a8_channel,torch.float16,256,128,7168,256,8,0,0,asm,10008+20102,520.7877 +gfx938,int8_w8a8_channel,torch.float16,320,128,7168,256,8,0,0,asm,10008+20102,539.2761 +gfx938,int8_w8a8_channel,torch.float16,384,128,7168,256,8,0,0,asm,10008+20102,544.8087 +gfx938,int8_w8a8_channel,torch.float16,448,128,7168,256,8,0,0,asm,10009+20102,573.4907 +gfx938,int8_w8a8_channel,torch.float16,512,128,7168,256,8,0,0,asm,10011+20102,596.1433 +gfx938,int8_w8a8_channel,torch.float16,576,128,7168,256,8,0,0,asm,11005+21102,656.0282 +gfx938,int8_w8a8_channel,torch.float16,640,128,7168,256,8,0,0,asm,11007+21102,635.9422 +gfx938,int8_w8a8_channel,torch.float16,704,128,7168,256,8,0,0,asm,11007+21102,645.1769 
+gfx938,int8_w8a8_channel,torch.float16,768,128,7168,256,8,0,0,asm,11007+21102,657.2918 +gfx938,int8_w8a8_channel,torch.float16,832,128,7168,256,8,0,0,asm,11007+21102,672.0241 +gfx938,int8_w8a8_channel,torch.float16,896,128,7168,256,8,0,0,asm,11002+21102,704.436 +gfx938,int8_w8a8_channel,torch.float16,960,128,7168,256,8,0,0,asm,11007+21102,732.9018 +gfx938,int8_w8a8_channel,torch.float16,1024,128,7168,256,8,0,0,asm,11007+21102,771.3553 +gfx938,int8_w8a8_channel,torch.float16,1152,128,7168,256,8,0,0,asm,11007+21102,876.3761 +gfx938,int8_w8a8_channel,torch.float16,1280,128,7168,256,8,0,0,asm,11005+21102,874.5662 +gfx938,int8_w8a8_channel,torch.float16,1408,128,7168,256,8,0,0,asm,11007+21102,920.0681 +gfx938,int8_w8a8_channel,torch.float16,1536,128,7168,256,8,0,0,asm,11005+21102,1005.6719 +gfx938,int8_w8a8_channel,torch.float16,1664,128,7168,256,8,0,0,asm,11003+21102,1021.9835 +gfx938,int8_w8a8_channel,torch.float16,1792,128,7168,256,8,0,0,asm,11005+21102,1059.247 +gfx938,int8_w8a8_channel,torch.float16,1920,128,7168,256,8,0,0,asm,11005+21102,1114.7504 +gfx938,int8_w8a8_channel,torch.float16,2048,128,7168,256,8,0,0,asm,11005+21102,1168.9655 +gfx938,int8_w8a8_channel,torch.float16,2304,128,7168,256,8,0,0,asm,11003+21102,1312.8102 +gfx938,int8_w8a8_channel,torch.float16,2560,128,7168,256,8,0,0,asm,11005+21102,1403.0569 +gfx938,int8_w8a8_channel,torch.float16,2816,128,7168,256,8,0,0,asm,11005+21102,1498.835 +gfx938,int8_w8a8_channel,torch.float16,3072,128,7168,256,8,0,0,asm,11003+21102,1613.8125 +gfx938,int8_w8a8_channel,torch.float16,3328,128,7168,256,8,0,0,asm,13001+23001,1700.2946 +gfx938,int8_w8a8_channel,torch.float16,3584,128,7168,256,8,0,0,asm,13001+23001,1783.0265 +gfx938,int8_w8a8_channel,torch.float16,3840,128,7168,256,8,0,0,asm,12005+22101,1873.9472 +gfx938,int8_w8a8_channel,torch.float16,4096,128,7168,256,8,0,0,asm,12005+22101,2008.9948 +gfx938,int8_w8a8_channel,torch.float16,4608,128,7168,256,8,0,0,asm,11005+21102,2286.6552 
+gfx938,int8_w8a8_channel,torch.float16,5120,128,7168,256,8,0,0,asm,12005+22101,2463.2891 +gfx938,int8_w8a8_channel,torch.float16,5632,128,7168,256,8,0,0,asm,12005+22101,2637.5543 +gfx938,int8_w8a8_channel,torch.float16,6144,128,7168,256,8,0,0,asm,12005+22101,2873.6969 +gfx938,int8_w8a8_channel,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23100,3049.9628 +gfx938,int8_w8a8_channel,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23101,3153.4823 +gfx938,int8_w8a8_channel,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23101,3301.7073 +gfx938,int8_w8a8_channel,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23101,3585.6404 +gfx938,int8_w8a8_channel,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23101,4373.7192 +gfx938,int8_w8a8_channel,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23101,5123.7714 +gfx938,int8_w8a8_channel,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23101,5923.3454 +gfx938,int8_w8a8_channel,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23101,6549.2382 +gfx938,int8_w8a8_channel,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23101,7048.0915 +gfx938,int8_w8a8_channel,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23101,9605.6151 +gfx938,int8_w8a8_channel,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23101,12610.251 +gfx938,int8_w8a8_channel,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,14891.5836 +gfx938,int8_w8a8_channel,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23101,17871.5385 +gfx938,int8_w8a8_channel,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23101,20962.4344 +gfx938,int8_w8a8_channel,torch.float16,65536,128,7168,256,8,0,0,asm,13001+23101,24165.9529 +gfx938,int8_w8a8_channel,torch.float16,1,256,7168,256,8,0,0,asm,10006+20001,68.5505 +gfx938,int8_w8a8_channel,torch.float16,2,256,7168,256,8,0,0,asm,10008+20001,100.8147 +gfx938,int8_w8a8_channel,torch.float16,3,256,7168,256,8,0,0,asm,10011+20001,133.5213 +gfx938,int8_w8a8_channel,torch.float16,4,256,7168,256,8,0,0,asm,10011+20001,160.5774 
+gfx938,int8_w8a8_channel,torch.float16,5,256,7168,256,8,0,0,asm,10007+20001,190.2981 +gfx938,int8_w8a8_channel,torch.float16,6,256,7168,256,8,0,0,asm,10008+20001,201.7628 +gfx938,int8_w8a8_channel,torch.float16,7,256,7168,256,8,0,0,asm,10008+20001,221.2054 +gfx938,int8_w8a8_channel,torch.float16,8,256,7168,256,8,0,0,asm,10011+20001,237.7383 +gfx938,int8_w8a8_channel,torch.float16,9,256,7168,256,8,0,0,asm,10011+20001,256.8013 +gfx938,int8_w8a8_channel,torch.float16,10,256,7168,256,8,0,0,asm,10008+20001,272.7689 +gfx938,int8_w8a8_channel,torch.float16,11,256,7168,256,8,0,0,asm,10006+20001,300.0406 +gfx938,int8_w8a8_channel,torch.float16,12,256,7168,256,8,0,0,asm,10008+20001,305.9437 +gfx938,int8_w8a8_channel,torch.float16,13,256,7168,256,8,0,0,asm,10008+20001,316.1973 +gfx938,int8_w8a8_channel,torch.float16,14,256,7168,256,8,0,0,asm,10011+20001,339.9413 +gfx938,int8_w8a8_channel,torch.float16,15,256,7168,256,8,0,0,asm,10011+20001,351.3606 +gfx938,int8_w8a8_channel,torch.float16,16,256,7168,256,8,0,0,asm,10011+20001,357.4739 +gfx938,int8_w8a8_channel,torch.float16,17,256,7168,256,8,0,0,asm,10008+20001,361.1618 +gfx938,int8_w8a8_channel,torch.float16,18,256,7168,256,8,0,0,asm,10008+20001,374.265 +gfx938,int8_w8a8_channel,torch.float16,20,256,7168,256,8,0,0,asm,10008+20001,401.4923 +gfx938,int8_w8a8_channel,torch.float16,24,256,7168,256,8,0,0,asm,10008+20001,453.172 +gfx938,int8_w8a8_channel,torch.float16,28,256,7168,256,8,0,0,asm,10006+20001,517.5313 +gfx938,int8_w8a8_channel,torch.float16,32,256,7168,256,8,0,0,asm,10008+20001,544.9005 +gfx938,int8_w8a8_channel,torch.float16,34,256,7168,256,8,0,0,asm,10008+20001,544.9203 +gfx938,int8_w8a8_channel,torch.float16,36,256,7168,256,8,0,0,asm,10008+20001,560.8699 +gfx938,int8_w8a8_channel,torch.float16,40,256,7168,256,8,0,0,asm,10006+20001,585.8207 +gfx938,int8_w8a8_channel,torch.float16,44,256,7168,256,8,0,0,asm,10009+20001,609.8321 +gfx938,int8_w8a8_channel,torch.float16,48,256,7168,256,8,0,0,asm,10008+20001,621.0491 
+gfx938,int8_w8a8_channel,torch.float16,56,256,7168,256,8,0,0,asm,10008+20001,655.6489 +gfx938,int8_w8a8_channel,torch.float16,64,256,7168,256,8,0,0,asm,10008+20001,678.5654 +gfx938,int8_w8a8_channel,torch.float16,68,256,7168,256,8,0,0,asm,10006+20001,704.5449 +gfx938,int8_w8a8_channel,torch.float16,72,256,7168,256,8,0,0,asm,10006+20001,699.4441 +gfx938,int8_w8a8_channel,torch.float16,80,256,7168,256,8,0,0,asm,10008+20001,720.0905 +gfx938,int8_w8a8_channel,torch.float16,88,256,7168,256,8,0,0,asm,10006+20001,750.3095 +gfx938,int8_w8a8_channel,torch.float16,96,256,7168,256,8,0,0,asm,10006+20001,754.8377 +gfx938,int8_w8a8_channel,torch.float16,104,256,7168,256,8,0,0,asm,10008+20001,773.6645 +gfx938,int8_w8a8_channel,torch.float16,112,256,7168,256,8,0,0,asm,10008+20001,774.8759 +gfx938,int8_w8a8_channel,torch.float16,128,256,7168,256,8,0,0,asm,10008+20001,789.6589 +gfx938,int8_w8a8_channel,torch.float16,144,256,7168,256,8,0,0,asm,10008+20001,797.6933 +gfx938,int8_w8a8_channel,torch.float16,160,256,7168,256,8,0,0,asm,10008+20001,807.9442 +gfx938,int8_w8a8_channel,torch.float16,192,256,7168,256,8,0,0,asm,10008+20001,820.3015 +gfx938,int8_w8a8_channel,torch.float16,224,256,7168,256,8,0,0,asm,10008+20001,830.4963 +gfx938,int8_w8a8_channel,torch.float16,256,256,7168,256,8,0,0,asm,10008+20001,845.8104 +gfx938,int8_w8a8_channel,torch.float16,320,256,7168,256,8,0,0,asm,10009+20001,880.0833 +gfx938,int8_w8a8_channel,torch.float16,384,256,7168,256,8,0,0,asm,10011+20001,907.5741 +gfx938,int8_w8a8_channel,torch.float16,448,256,7168,256,8,0,0,asm,11004+21001,925.8605 +gfx938,int8_w8a8_channel,torch.float16,512,256,7168,256,8,0,0,asm,11004+21001,949.4826 +gfx938,int8_w8a8_channel,torch.float16,576,256,7168,256,8,0,0,asm,11004+21001,1004.8272 +gfx938,int8_w8a8_channel,torch.float16,640,256,7168,256,8,0,0,asm,11004+21001,981.5911 +gfx938,int8_w8a8_channel,torch.float16,704,256,7168,256,8,0,0,asm,11004+21001,1006.4282 
+gfx938,int8_w8a8_channel,torch.float16,768,256,7168,256,8,0,0,asm,11007+21001,1028.9819 +gfx938,int8_w8a8_channel,torch.float16,832,256,7168,256,8,0,0,asm,11007+21001,1062.8699 +gfx938,int8_w8a8_channel,torch.float16,896,256,7168,256,8,0,0,asm,11005+21001,1111.3783 +gfx938,int8_w8a8_channel,torch.float16,960,256,7168,256,8,0,0,asm,11005+21001,1172.5677 +gfx938,int8_w8a8_channel,torch.float16,1024,256,7168,256,8,0,0,asm,11005+21001,1198.2545 +gfx938,int8_w8a8_channel,torch.float16,1152,256,7168,256,8,0,0,asm,12005+22001,1312.7485 +gfx938,int8_w8a8_channel,torch.float16,1280,256,7168,256,8,0,0,asm,12003+22001,1305.7395 +gfx938,int8_w8a8_channel,torch.float16,1408,256,7168,256,8,0,0,asm,12005+22001,1344.1193 +gfx938,int8_w8a8_channel,torch.float16,1536,256,7168,256,8,0,0,asm,12000+22001,1382.4336 +gfx938,int8_w8a8_channel,torch.float16,1664,256,7168,256,8,0,0,asm,12005+22001,1410.499 +gfx938,int8_w8a8_channel,torch.float16,1792,256,7168,256,8,0,0,asm,12001+22001,1492.3818 +gfx938,int8_w8a8_channel,torch.float16,1920,256,7168,256,8,0,0,asm,12005+22001,1537.2226 +gfx938,int8_w8a8_channel,torch.float16,2048,256,7168,256,8,0,0,asm,12005+22001,1697.7758 +gfx938,int8_w8a8_channel,torch.float16,2304,256,7168,256,8,0,0,asm,12005+22001,1956.1871 +gfx938,int8_w8a8_channel,torch.float16,2560,256,7168,256,8,0,0,asm,13001+23001,2038.5128 +gfx938,int8_w8a8_channel,torch.float16,2816,256,7168,256,8,0,0,asm,13001+23001,2095.8748 +gfx938,int8_w8a8_channel,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23001,2151.5135 +gfx938,int8_w8a8_channel,torch.float16,3328,256,7168,256,8,0,0,asm,13001+23001,2219.1767 +gfx938,int8_w8a8_channel,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23001,2287.4431 +gfx938,int8_w8a8_channel,torch.float16,3840,256,7168,256,8,0,0,asm,13001+23001,2475.5367 +gfx938,int8_w8a8_channel,torch.float16,4096,256,7168,256,8,0,0,asm,12005+22001,2775.478 +gfx938,int8_w8a8_channel,torch.float16,4608,256,7168,256,8,0,0,asm,12001+22001,3167.0785 
+gfx938,int8_w8a8_channel,torch.float16,5120,256,7168,256,8,0,0,asm,12005+22001,3359.1433 +gfx938,int8_w8a8_channel,torch.float16,5632,256,7168,256,8,0,0,asm,12001+22001,3549.321 +gfx938,int8_w8a8_channel,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23001,3848.2614 +gfx938,int8_w8a8_channel,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23001,3960.4866 +gfx938,int8_w8a8_channel,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23001,4072.7199 +gfx938,int8_w8a8_channel,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23001,4313.7529 +gfx938,int8_w8a8_channel,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23001,4792.6391 +gfx938,int8_w8a8_channel,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23001,5855.5025 +gfx938,int8_w8a8_channel,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23001,6805.8167 +gfx938,int8_w8a8_channel,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23001,7873.4718 +gfx938,int8_w8a8_channel,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23001,8847.507 +gfx938,int8_w8a8_channel,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23001,9618.6136 +gfx938,int8_w8a8_channel,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23001,12937.4104 +gfx938,int8_w8a8_channel,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23001,16973.6735 +gfx938,int8_w8a8_channel,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23001,21009.7928 +gfx938,int8_w8a8_channel,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23001,24983.1394 +gfx938,int8_w8a8_channel,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23001,29099.3916 +gfx938,int8_w8a8_channel,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23001,33141.3321 +gfx938,f8_w8a8_channel,torch.float16,1,128,7168,256,8,0,0,asm,10000+20101,63.2993 +gfx938,f8_w8a8_channel,torch.float16,2,128,7168,256,8,0,0,asm,10002+20102,75.3779 +gfx938,f8_w8a8_channel,torch.float16,3,128,7168,256,8,0,0,asm,10008+20102,83.7752 +gfx938,f8_w8a8_channel,torch.float16,4,128,7168,256,8,0,0,asm,10002+20102,102.3926 
+gfx938,f8_w8a8_channel,torch.float16,5,128,7168,256,8,0,0,asm,10011+20102,124.0745 +gfx938,f8_w8a8_channel,torch.float16,6,128,7168,256,8,0,0,asm,10011+20102,132.456 +gfx938,f8_w8a8_channel,torch.float16,7,128,7168,256,8,0,0,asm,10011+20102,141.4159 +gfx938,f8_w8a8_channel,torch.float16,8,128,7168,256,8,0,0,asm,10008+20102,145.7236 +gfx938,f8_w8a8_channel,torch.float16,9,128,7168,256,8,0,0,asm,10008+20102,155.8406 +gfx938,f8_w8a8_channel,torch.float16,10,128,7168,256,8,0,0,asm,10009+20102,169.2886 +gfx938,f8_w8a8_channel,torch.float16,11,128,7168,256,8,0,0,asm,10006+20102,195.3932 +gfx938,f8_w8a8_channel,torch.float16,12,128,7168,256,8,0,0,asm,10006+20102,197.5265 +gfx938,f8_w8a8_channel,torch.float16,13,128,7168,256,8,0,0,asm,10006+20102,202.384 +gfx938,f8_w8a8_channel,torch.float16,14,128,7168,256,8,0,0,asm,10009+20102,208.2793 +gfx938,f8_w8a8_channel,torch.float16,15,128,7168,256,8,0,0,asm,10008+20102,211.4998 +gfx938,f8_w8a8_channel,torch.float16,16,128,7168,256,8,0,0,asm,10008+20102,216.0495 +gfx938,f8_w8a8_channel,torch.float16,17,128,7168,256,8,0,0,asm,10009+20102,217.7286 +gfx938,f8_w8a8_channel,torch.float16,18,128,7168,256,8,0,0,asm,10008+20102,226.1962 +gfx938,f8_w8a8_channel,torch.float16,20,128,7168,256,8,0,0,asm,10011+20102,250.2782 +gfx938,f8_w8a8_channel,torch.float16,24,128,7168,256,8,0,0,asm,10008+20102,269.9047 +gfx938,f8_w8a8_channel,torch.float16,28,128,7168,256,8,0,0,asm,10006+20102,324.0327 +gfx938,f8_w8a8_channel,torch.float16,32,128,7168,256,8,0,0,asm,10008+20102,332.3731 +gfx938,f8_w8a8_channel,torch.float16,34,128,7168,256,8,0,0,asm,10008+20102,323.6891 +gfx938,f8_w8a8_channel,torch.float16,36,128,7168,256,8,0,0,asm,10008+20102,335.5496 +gfx938,f8_w8a8_channel,torch.float16,40,128,7168,256,8,0,0,asm,10006+20102,360.0131 +gfx938,f8_w8a8_channel,torch.float16,44,128,7168,256,8,0,0,asm,10006+20102,368.4807 +gfx938,f8_w8a8_channel,torch.float16,48,128,7168,256,8,0,0,asm,10006+20102,373.211 
+gfx938,f8_w8a8_channel,torch.float16,56,128,7168,256,8,0,0,asm,10008+20102,387.0038 +gfx938,f8_w8a8_channel,torch.float16,64,128,7168,256,8,0,0,asm,10006+20102,412.3411 +gfx938,f8_w8a8_channel,torch.float16,68,128,7168,256,8,0,0,asm,10009+20102,427.3523 +gfx938,f8_w8a8_channel,torch.float16,72,128,7168,256,8,0,0,asm,10007+20102,429.8217 +gfx938,f8_w8a8_channel,torch.float16,80,128,7168,256,8,0,0,asm,10006+20102,436.6703 +gfx938,f8_w8a8_channel,torch.float16,88,128,7168,256,8,0,0,asm,10009+20102,450.7544 +gfx938,f8_w8a8_channel,torch.float16,96,128,7168,256,8,0,0,asm,10008+20102,448.3286 +gfx938,f8_w8a8_channel,torch.float16,104,128,7168,256,8,0,0,asm,10008+20102,455.746 +gfx938,f8_w8a8_channel,torch.float16,112,128,7168,256,8,0,0,asm,10008+20102,460.9111 +gfx938,f8_w8a8_channel,torch.float16,128,128,7168,256,8,0,0,asm,10008+20102,473.5223 +gfx938,f8_w8a8_channel,torch.float16,144,128,7168,256,8,0,0,asm,10006+20102,489.0501 +gfx938,f8_w8a8_channel,torch.float16,160,128,7168,256,8,0,0,asm,10006+20102,494.0089 +gfx938,f8_w8a8_channel,torch.float16,192,128,7168,256,8,0,0,asm,10007+20102,503.0324 +gfx938,f8_w8a8_channel,torch.float16,224,128,7168,256,8,0,0,asm,10007+20102,509.8447 +gfx938,f8_w8a8_channel,torch.float16,256,128,7168,256,8,0,0,asm,10006+20102,519.6637 +gfx938,f8_w8a8_channel,torch.float16,320,128,7168,256,8,0,0,asm,10011+20102,541.1159 +gfx938,f8_w8a8_channel,torch.float16,384,128,7168,256,8,0,0,asm,10008+20102,544.4964 +gfx938,f8_w8a8_channel,torch.float16,448,128,7168,256,8,0,0,asm,10008+20102,571.483 +gfx938,f8_w8a8_channel,torch.float16,512,128,7168,256,8,0,0,asm,10011+20102,594.417 +gfx938,f8_w8a8_channel,torch.float16,576,128,7168,256,8,0,0,asm,11007+21102,655.6798 +gfx938,f8_w8a8_channel,torch.float16,640,128,7168,256,8,0,0,asm,11007+21102,642.2806 +gfx938,f8_w8a8_channel,torch.float16,704,128,7168,256,8,0,0,asm,11005+21102,654.5883 +gfx938,f8_w8a8_channel,torch.float16,768,128,7168,256,8,0,0,asm,11007+21102,664.4925 
+gfx938,f8_w8a8_channel,torch.float16,832,128,7168,256,8,0,0,asm,11002+21102,675.4143 +gfx938,f8_w8a8_channel,torch.float16,896,128,7168,256,8,0,0,asm,11007+21102,724.4094 +gfx938,f8_w8a8_channel,torch.float16,960,128,7168,256,8,0,0,asm,11007+21102,733.8166 +gfx938,f8_w8a8_channel,torch.float16,1024,128,7168,256,8,0,0,asm,11005+21102,792.8112 +gfx938,f8_w8a8_channel,torch.float16,1152,128,7168,256,8,0,0,asm,11007+21102,889.0462 +gfx938,f8_w8a8_channel,torch.float16,1280,128,7168,256,8,0,0,asm,11007+21102,898.7677 +gfx938,f8_w8a8_channel,torch.float16,1408,128,7168,256,8,0,0,asm,11007+21102,928.2732 +gfx938,f8_w8a8_channel,torch.float16,1536,128,7168,256,8,0,0,asm,12005+22001,989.7838 +gfx938,f8_w8a8_channel,torch.float16,1664,128,7168,256,8,0,0,asm,11007+21102,1037.5527 +gfx938,f8_w8a8_channel,torch.float16,1792,128,7168,256,8,0,0,asm,11005+21102,1076.8675 +gfx938,f8_w8a8_channel,torch.float16,1920,128,7168,256,8,0,0,asm,11005+21102,1136.2149 +gfx938,f8_w8a8_channel,torch.float16,2048,128,7168,256,8,0,0,asm,11005+21102,1185.9542 +gfx938,f8_w8a8_channel,torch.float16,2304,128,7168,256,8,0,0,asm,11005+21102,1348.5463 +gfx938,f8_w8a8_channel,torch.float16,2560,128,7168,256,8,0,0,asm,11005+21102,1420.4804 +gfx938,f8_w8a8_channel,torch.float16,2816,128,7168,256,8,0,0,asm,11005+21102,1525.0421 +gfx938,f8_w8a8_channel,torch.float16,3072,128,7168,256,8,0,0,asm,13001+23001,1643.3144 +gfx938,f8_w8a8_channel,torch.float16,3328,128,7168,256,8,0,0,asm,13001+23001,1692.1798 +gfx938,f8_w8a8_channel,torch.float16,3584,128,7168,256,8,0,0,asm,13001+23001,1774.5672 +gfx938,f8_w8a8_channel,torch.float16,3840,128,7168,256,8,0,0,asm,12005+22101,1901.4262 +gfx938,f8_w8a8_channel,torch.float16,4096,128,7168,256,8,0,0,asm,12005+22101,2048.3671 +gfx938,f8_w8a8_channel,torch.float16,4608,128,7168,256,8,0,0,asm,11005+21102,2313.0716 +gfx938,f8_w8a8_channel,torch.float16,5120,128,7168,256,8,0,0,asm,12001+22101,2501.2432 
+gfx938,f8_w8a8_channel,torch.float16,5632,128,7168,256,8,0,0,asm,12005+22001,2673.3164 +gfx938,f8_w8a8_channel,torch.float16,6144,128,7168,256,8,0,0,asm,12005+22101,2910.5216 +gfx938,f8_w8a8_channel,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23001,3051.8871 +gfx938,f8_w8a8_channel,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23001,3169.9625 +gfx938,f8_w8a8_channel,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23001,3321.1865 +gfx938,f8_w8a8_channel,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23101,3639.3636 +gfx938,f8_w8a8_channel,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23101,4414.8517 +gfx938,f8_w8a8_channel,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23101,5191.0948 +gfx938,f8_w8a8_channel,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23100,6015.8587 +gfx938,f8_w8a8_channel,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23101,6643.1382 +gfx938,f8_w8a8_channel,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23101,7147.6777 +gfx938,f8_w8a8_channel,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23101,9742.9246 +gfx938,f8_w8a8_channel,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23101,12818.7074 +gfx938,f8_w8a8_channel,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15088.9874 +gfx938,f8_w8a8_channel,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23101,18123.5682 +gfx938,f8_w8a8_channel,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23101,21277.0495 +gfx938,f8_w8a8_channel,torch.float16,65536,128,7168,256,8,0,0,asm,13001+23101,24574.4229 +gfx938,f8_w8a8_channel,torch.float16,1,256,7168,256,8,0,0,asm,10002+20001,71.3773 +gfx938,f8_w8a8_channel,torch.float16,2,256,7168,256,8,0,0,asm,10008+20001,103.0325 +gfx938,f8_w8a8_channel,torch.float16,3,256,7168,256,8,0,0,asm,10011+20001,132.8016 +gfx938,f8_w8a8_channel,torch.float16,4,256,7168,256,8,0,0,asm,10008+20001,161.5924 +gfx938,f8_w8a8_channel,torch.float16,5,256,7168,256,8,0,0,asm,10006+20001,188.8424 
+gfx938,f8_w8a8_channel,torch.float16,6,256,7168,256,8,0,0,asm,10008+20001,203.0139 +gfx938,f8_w8a8_channel,torch.float16,7,256,7168,256,8,0,0,asm,10008+20001,221.2909 +gfx938,f8_w8a8_channel,torch.float16,8,256,7168,256,8,0,0,asm,10011+20001,237.7794 +gfx938,f8_w8a8_channel,torch.float16,9,256,7168,256,8,0,0,asm,10011+20001,257.2902 +gfx938,f8_w8a8_channel,torch.float16,10,256,7168,256,8,0,0,asm,10011+20001,272.7781 +gfx938,f8_w8a8_channel,torch.float16,11,256,7168,256,8,0,0,asm,10006+20001,298.4969 +gfx938,f8_w8a8_channel,torch.float16,12,256,7168,256,8,0,0,asm,10009+20001,307.2374 +gfx938,f8_w8a8_channel,torch.float16,13,256,7168,256,8,0,0,asm,10008+20001,317.3495 +gfx938,f8_w8a8_channel,torch.float16,14,256,7168,256,8,0,0,asm,10011+20001,339.3927 +gfx938,f8_w8a8_channel,torch.float16,15,256,7168,256,8,0,0,asm,10011+20001,349.8841 +gfx938,f8_w8a8_channel,torch.float16,16,256,7168,256,8,0,0,asm,10011+20001,358.5241 +gfx938,f8_w8a8_channel,torch.float16,17,256,7168,256,8,0,0,asm,10009+20001,361.753 +gfx938,f8_w8a8_channel,torch.float16,18,256,7168,256,8,0,0,asm,10008+20001,375.0472 +gfx938,f8_w8a8_channel,torch.float16,20,256,7168,256,8,0,0,asm,10006+20001,403.993 +gfx938,f8_w8a8_channel,torch.float16,24,256,7168,256,8,0,0,asm,10008+20001,453.9264 +gfx938,f8_w8a8_channel,torch.float16,28,256,7168,256,8,0,0,asm,10008+20001,516.5211 +gfx938,f8_w8a8_channel,torch.float16,32,256,7168,256,8,0,0,asm,10009+20001,546.433 +gfx938,f8_w8a8_channel,torch.float16,34,256,7168,256,8,0,0,asm,10011+20001,547.1689 +gfx938,f8_w8a8_channel,torch.float16,36,256,7168,256,8,0,0,asm,10008+20001,562.6345 +gfx938,f8_w8a8_channel,torch.float16,40,256,7168,256,8,0,0,asm,10006+20001,586.7441 +gfx938,f8_w8a8_channel,torch.float16,44,256,7168,256,8,0,0,asm,10008+20001,609.3898 +gfx938,f8_w8a8_channel,torch.float16,48,256,7168,256,8,0,0,asm,10008+20001,623.9783 +gfx938,f8_w8a8_channel,torch.float16,56,256,7168,256,8,0,0,asm,10008+20001,655.9684 
+gfx938,f8_w8a8_channel,torch.float16,64,256,7168,256,8,0,0,asm,10006+20001,680.2431 +gfx938,f8_w8a8_channel,torch.float16,68,256,7168,256,8,0,0,asm,10006+20001,686.086 +gfx938,f8_w8a8_channel,torch.float16,72,256,7168,256,8,0,0,asm,10012+20001,720.2646 +gfx938,f8_w8a8_channel,torch.float16,80,256,7168,256,8,0,0,asm,10008+20001,771.0738 +gfx938,f8_w8a8_channel,torch.float16,88,256,7168,256,8,0,0,asm,10008+20001,745.2486 +gfx938,f8_w8a8_channel,torch.float16,96,256,7168,256,8,0,0,asm,10006+20001,755.5114 +gfx938,f8_w8a8_channel,torch.float16,104,256,7168,256,8,0,0,asm,10008+20001,770.5044 +gfx938,f8_w8a8_channel,torch.float16,112,256,7168,256,8,0,0,asm,10008+20001,775.8668 +gfx938,f8_w8a8_channel,torch.float16,128,256,7168,256,8,0,0,asm,10008+20001,789.3307 +gfx938,f8_w8a8_channel,torch.float16,144,256,7168,256,8,0,0,asm,10008+20001,801.6573 +gfx938,f8_w8a8_channel,torch.float16,160,256,7168,256,8,0,0,asm,10008+20001,808.8431 +gfx938,f8_w8a8_channel,torch.float16,192,256,7168,256,8,0,0,asm,10008+20001,822.915 +gfx938,f8_w8a8_channel,torch.float16,224,256,7168,256,8,0,0,asm,10008+20001,833.1515 +gfx938,f8_w8a8_channel,torch.float16,256,256,7168,256,8,0,0,asm,10008+20001,843.5744 +gfx938,f8_w8a8_channel,torch.float16,320,256,7168,256,8,0,0,asm,10008+20001,875.8173 +gfx938,f8_w8a8_channel,torch.float16,384,256,7168,256,8,0,0,asm,11002+21001,903.3123 +gfx938,f8_w8a8_channel,torch.float16,448,256,7168,256,8,0,0,asm,11004+21001,923.768 +gfx938,f8_w8a8_channel,torch.float16,512,256,7168,256,8,0,0,asm,11004+21001,945.9661 +gfx938,f8_w8a8_channel,torch.float16,576,256,7168,256,8,0,0,asm,11004+21001,1004.9307 +gfx938,f8_w8a8_channel,torch.float16,640,256,7168,256,8,0,0,asm,11004+21001,984.5559 +gfx938,f8_w8a8_channel,torch.float16,704,256,7168,256,8,0,0,asm,11004+21001,1009.4000 +gfx938,f8_w8a8_channel,torch.float16,768,256,7168,256,8,0,0,asm,11007+21001,1039.9012 +gfx938,f8_w8a8_channel,torch.float16,832,256,7168,256,8,0,0,asm,11005+21001,1073.4592 
+gfx938,f8_w8a8_channel,torch.float16,896,256,7168,256,8,0,0,asm,11005+21001,1100.2193 +gfx938,f8_w8a8_channel,torch.float16,960,256,7168,256,8,0,0,asm,11003+21001,1159.3657 +gfx938,f8_w8a8_channel,torch.float16,1024,256,7168,256,8,0,0,asm,11005+21001,1212.9271 +gfx938,f8_w8a8_channel,torch.float16,1152,256,7168,256,8,0,0,asm,12005+22001,1301.3286 +gfx938,f8_w8a8_channel,torch.float16,1280,256,7168,256,8,0,0,asm,12000+22001,1317.5435 +gfx938,f8_w8a8_channel,torch.float16,1408,256,7168,256,8,0,0,asm,12005+22001,1342.2792 +gfx938,f8_w8a8_channel,torch.float16,1536,256,7168,256,8,0,0,asm,12005+22001,1367.209 +gfx938,f8_w8a8_channel,torch.float16,1664,256,7168,256,8,0,0,asm,12005+22001,1409.4242 +gfx938,f8_w8a8_channel,torch.float16,1792,256,7168,256,8,0,0,asm,12005+22001,1489.8281 +gfx938,f8_w8a8_channel,torch.float16,1920,256,7168,256,8,0,0,asm,12005+22001,1528.3161 +gfx938,f8_w8a8_channel,torch.float16,2048,256,7168,256,8,0,0,asm,12001+22001,1688.9472 +gfx938,f8_w8a8_channel,torch.float16,2304,256,7168,256,8,0,0,asm,12005+22001,1951.8713 +gfx938,f8_w8a8_channel,torch.float16,2560,256,7168,256,8,0,0,asm,13001+23001,2031.2743 +gfx938,f8_w8a8_channel,torch.float16,2816,256,7168,256,8,0,0,asm,13001+23001,2085.2615 +gfx938,f8_w8a8_channel,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23001,2147.5381 +gfx938,f8_w8a8_channel,torch.float16,3328,256,7168,256,8,0,0,asm,13001+23001,2207.5995 +gfx938,f8_w8a8_channel,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23001,2290.0607 +gfx938,f8_w8a8_channel,torch.float16,3840,256,7168,256,8,0,0,asm,13001+23001,2455.1266 +gfx938,f8_w8a8_channel,torch.float16,4096,256,7168,256,8,0,0,asm,12005+22001,2762.0217 +gfx938,f8_w8a8_channel,torch.float16,4608,256,7168,256,8,0,0,asm,12001+22001,3148.4202 +gfx938,f8_w8a8_channel,torch.float16,5120,256,7168,256,8,0,0,asm,12001+22001,3345.6214 +gfx938,f8_w8a8_channel,torch.float16,5632,256,7168,256,8,0,0,asm,12001+22001,3531.3437 
+gfx938,f8_w8a8_channel,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23001,3833.4307 +gfx938,f8_w8a8_channel,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23001,3933.061 +gfx938,f8_w8a8_channel,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23001,4047.1816 +gfx938,f8_w8a8_channel,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23001,4302.8155 +gfx938,f8_w8a8_channel,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23001,4770.8874 +gfx938,f8_w8a8_channel,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23001,5814.4894 +gfx938,f8_w8a8_channel,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23001,6783.9214 +gfx938,f8_w8a8_channel,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23001,7833.6239 +gfx938,f8_w8a8_channel,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23001,8799.1627 +gfx938,f8_w8a8_channel,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23001,9562.2695 +gfx938,f8_w8a8_channel,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23001,12888.3568 +gfx938,f8_w8a8_channel,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23001,16903.9514 +gfx938,f8_w8a8_channel,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23001,20904.6085 +gfx938,f8_w8a8_channel,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23001,24866.1601 +gfx938,f8_w8a8_channel,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23001,28986.4682 +gfx938,f8_w8a8_channel,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23001,33012.5482 +gfx938,f8_w8a8_channel,torch.float16,1,352,4096,128,8,0,0,asm,10001+20000,63.0496 +gfx938,f8_w8a8_channel,torch.float16,2,352,4096,128,8,0,0,asm,10001+20000,89.0958 +gfx938,f8_w8a8_channel,torch.float16,4,352,4096,128,8,0,0,asm,10006+20000,128.5736 +gfx938,f8_w8a8_channel,torch.float16,6,352,4096,128,8,0,0,asm,10006+20000,161.6681 +gfx938,f8_w8a8_channel,torch.float16,8,352,4096,128,8,0,0,asm,10007+20000,193.1626 +gfx938,f8_w8a8_channel,torch.float16,12,352,4096,128,8,0,0,asm,10006+20001,237.1877 
+gfx938,f8_w8a8_channel,torch.float16,16,352,4096,128,8,0,0,asm,10006+20000,271.0993 +gfx938,f8_w8a8_channel,torch.float16,20,352,4096,128,8,0,0,asm,10007+20000,309.4708 +gfx938,f8_w8a8_channel,torch.float16,24,352,4096,128,8,0,0,asm,10007+20000,317.0666 +gfx938,f8_w8a8_channel,torch.float16,28,352,4096,128,8,0,0,asm,10006+20000,320.9012 +gfx938,f8_w8a8_channel,torch.float16,32,352,4096,128,8,0,0,asm,10007+20001,333.1537 +gfx938,f8_w8a8_channel,torch.float16,36,352,4096,128,8,0,0,asm,10006+20001,340.7128 +gfx938,f8_w8a8_channel,torch.float16,40,352,4096,128,8,0,0,asm,10006+20000,346.6581 +gfx938,f8_w8a8_channel,torch.float16,44,352,4096,128,8,0,0,asm,10006+20000,359.2222 +gfx938,f8_w8a8_channel,torch.float16,48,352,4096,128,8,0,0,asm,10006+20000,367.6264 +gfx938,f8_w8a8_channel,torch.float16,56,352,4096,128,8,0,0,asm,10006+20001,368.0895 +gfx938,f8_w8a8_channel,torch.float16,64,352,4096,128,8,0,0,asm,10006+20001,379.8314 +gfx938,f8_w8a8_channel,torch.float16,72,352,4096,128,8,0,0,asm,10006+20000,372.8558 +gfx938,f8_w8a8_channel,torch.float16,80,352,4096,128,8,0,0,asm,10006+20000,371.9043 +gfx938,f8_w8a8_channel,torch.float16,96,352,4096,128,8,0,0,asm,10006+20000,376.7547 +gfx938,f8_w8a8_channel,torch.float16,104,352,4096,128,8,0,0,asm,10006+20000,378.9442 +gfx938,f8_w8a8_channel,torch.float16,112,352,4096,128,8,0,0,asm,10006+20001,381.2263 +gfx938,f8_w8a8_channel,torch.float16,128,352,4096,128,8,0,0,asm,10006+20001,385.3105 +gfx938,f8_w8a8_channel,torch.float16,144,352,4096,128,8,0,0,asm,10006+20001,430.5903 +gfx938,f8_w8a8_channel,torch.float16,160,352,4096,128,8,0,0,asm,10006+20000,399.4326 +gfx938,f8_w8a8_channel,torch.float16,192,352,4096,128,8,0,0,asm,10006+20001,418.3883 +gfx938,f8_w8a8_channel,torch.float16,224,352,4096,128,8,0,0,asm,11002+21000,434.2787 +gfx938,f8_w8a8_channel,torch.float16,256,352,4096,128,8,0,0,asm,11004+21000,446.9103 +gfx938,f8_w8a8_channel,torch.float16,320,352,4096,128,8,0,0,asm,11004+21001,460.9228 
+gfx938,f8_w8a8_channel,torch.float16,384,352,4096,128,8,0,0,asm,11004+21001,529.2931 +gfx938,f8_w8a8_channel,torch.float16,448,352,4096,128,8,0,0,asm,11004+21001,508.83 +gfx938,f8_w8a8_channel,torch.float16,512,352,4096,128,8,0,0,asm,11004+21001,554.5308 +gfx938,f8_w8a8_channel,torch.float16,768,352,4096,128,8,0,0,asm,12002+22001,625.1664 +gfx938,f8_w8a8_channel,torch.float16,896,352,4096,128,8,0,0,asm,12004+22001,665.6967 +gfx938,f8_w8a8_channel,torch.float16,960,352,4096,128,8,0,0,asm,12000+22001,710.7071 +gfx938,f8_w8a8_channel,torch.float16,1024,352,4096,128,8,0,0,asm,12005+22001,746.8753 +gfx938,f8_w8a8_channel,torch.float16,1280,352,4096,128,8,0,0,asm,12001+22001,861.7717 +gfx938,f8_w8a8_channel,torch.float16,1536,352,4096,128,8,0,0,asm,12001+22001,918.0662 +gfx938,f8_w8a8_channel,torch.float16,1920,352,4096,128,8,0,0,asm,12001+22001,995.969 +gfx938,f8_w8a8_channel,torch.float16,2048,352,4096,128,8,0,0,asm,12001+22001,1101.2402 +gfx938,f8_w8a8_channel,torch.float16,2304,352,4096,128,8,0,0,asm,12001+22001,1248.389 +gfx938,f8_w8a8_channel,torch.float16,2560,352,4096,128,8,0,0,asm,12001+22001,1272.6331 +gfx938,f8_w8a8_channel,torch.float16,3072,352,4096,128,8,0,0,asm,13001+23001,1460.7586 +gfx938,f8_w8a8_channel,torch.float16,3584,352,4096,128,8,0,0,asm,13001+23001,1532.7836 +gfx938,f8_w8a8_channel,torch.float16,3840,352,4096,128,8,0,0,asm,13001+23001,1618.2737 +gfx938,f8_w8a8_channel,torch.float16,4096,352,4096,128,8,0,0,asm,13001+23001,1813.9613 +gfx938,f8_w8a8_channel,torch.float16,4608,352,4096,128,8,0,0,asm,13001+23001,2081.9939 +gfx938,f8_w8a8_channel,torch.float16,5120,352,4096,128,8,0,0,asm,13001+23001,2147.0968 +gfx938,f8_w8a8_channel,torch.float16,6144,352,4096,128,8,0,0,asm,13001+23001,2515.8026 +gfx938,f8_w8a8_channel,torch.float16,7168,352,4096,128,8,0,0,asm,13001+23001,2873.9653 +gfx938,f8_w8a8_channel,torch.float16,8192,352,4096,128,8,0,0,asm,13001+23001,3216.9279 
+gfx938,f8_w8a8_channel,torch.float16,10240,352,4096,128,8,0,0,asm,13001+23001,3971.7204 +gfx938,f8_w8a8_channel,torch.float16,12288,352,4096,128,8,0,0,asm,13001+23001,4688.2562 +gfx938,f8_w8a8_channel,torch.float16,16384,352,4096,128,8,0,0,asm,13001+23001,6226.4982 +gfx938,f8_w8a8_channel,torch.float16,24576,352,4096,128,8,0,0,asm,13001+23001,8953.0047 +gfx938,f8_w8a8_channel,torch.float16,32768,352,4096,128,8,0,0,asm,13001+23001,11870.9884 +gfx938,f8_w8a8_channel,torch.float16,1,352,4096,129,9,0,0,asm,10004+20000,66.9906 +gfx938,f8_w8a8_channel,torch.float16,2,352,4096,129,9,0,0,asm,10006+20000,93.7946 +gfx938,f8_w8a8_channel,torch.float16,4,352,4096,129,9,0,0,asm,10006+20000,135.1756 +gfx938,f8_w8a8_channel,torch.float16,6,352,4096,129,9,0,0,asm,10006+20000,169.0953 +gfx938,f8_w8a8_channel,torch.float16,8,352,4096,129,9,0,0,asm,10006+20000,206.291 +gfx938,f8_w8a8_channel,torch.float16,12,352,4096,129,9,0,0,asm,10006+20000,247.0656 +gfx938,f8_w8a8_channel,torch.float16,16,352,4096,129,9,0,0,asm,10006+20000,289.8192 +gfx938,f8_w8a8_channel,torch.float16,20,352,4096,129,9,0,0,asm,10007+20000,319.9213 +gfx938,f8_w8a8_channel,torch.float16,24,352,4096,129,9,0,0,asm,10006+20000,338.0181 +gfx938,f8_w8a8_channel,torch.float16,28,352,4096,129,9,0,0,asm,10006+20000,339.0821 +gfx938,f8_w8a8_channel,torch.float16,32,352,4096,129,9,0,0,asm,10006+20000,344.3705 +gfx938,f8_w8a8_channel,torch.float16,36,352,4096,129,9,0,0,asm,10006+20000,349.698 +gfx938,f8_w8a8_channel,torch.float16,40,352,4096,129,9,0,0,asm,10006+20000,356.6116 +gfx938,f8_w8a8_channel,torch.float16,44,352,4096,129,9,0,0,asm,10006+20000,364.4264 +gfx938,f8_w8a8_channel,torch.float16,48,352,4096,129,9,0,0,asm,10006+20000,369.5632 +gfx938,f8_w8a8_channel,torch.float16,56,352,4096,129,9,0,0,asm,10006+20000,374.7421 +gfx938,f8_w8a8_channel,torch.float16,64,352,4096,129,9,0,0,asm,10006+20000,383.0145 +gfx938,f8_w8a8_channel,torch.float16,72,352,4096,129,9,0,0,asm,10006+20000,381.8074 
+gfx938,f8_w8a8_channel,torch.float16,80,352,4096,129,9,0,0,asm,10007+20000,377.8916 +gfx938,f8_w8a8_channel,torch.float16,96,352,4096,129,9,0,0,asm,10006+20000,385.0915 +gfx938,f8_w8a8_channel,torch.float16,104,352,4096,129,9,0,0,asm,10006+20000,384.6873 +gfx938,f8_w8a8_channel,torch.float16,112,352,4096,129,9,0,0,asm,10006+20000,387.6684 +gfx938,f8_w8a8_channel,torch.float16,128,352,4096,129,9,0,0,asm,10006+20001,389.6473 +gfx938,f8_w8a8_channel,torch.float16,144,352,4096,129,9,0,0,asm,10006+20001,403.0031 +gfx938,f8_w8a8_channel,torch.float16,160,352,4096,129,9,0,0,asm,10006+20001,404.7041 +gfx938,f8_w8a8_channel,torch.float16,192,352,4096,129,9,0,0,asm,11004+21000,428.0303 +gfx938,f8_w8a8_channel,torch.float16,224,352,4096,129,9,0,0,asm,11004+21001,438.6576 +gfx938,f8_w8a8_channel,torch.float16,256,352,4096,129,9,0,0,asm,11004+21000,446.2871 +gfx938,f8_w8a8_channel,torch.float16,320,352,4096,129,9,0,0,asm,11004+21001,471.8112 +gfx938,f8_w8a8_channel,torch.float16,384,352,4096,129,9,0,0,asm,11004+21000,494.91 +gfx938,f8_w8a8_channel,torch.float16,448,352,4096,129,9,0,0,asm,11004+21001,535.9625 +gfx938,f8_w8a8_channel,torch.float16,512,352,4096,129,9,0,0,asm,12000+22001,566.0424 +gfx938,f8_w8a8_channel,torch.float16,768,352,4096,129,9,0,0,asm,12004+22001,633.3263 +gfx938,f8_w8a8_channel,torch.float16,896,352,4096,129,9,0,0,asm,12001+22001,714.4207 +gfx938,f8_w8a8_channel,torch.float16,960,352,4096,129,9,0,0,asm,12003+22001,785.0646 +gfx938,f8_w8a8_channel,torch.float16,1024,352,4096,129,9,0,0,asm,12001+22001,842.2433 +gfx938,f8_w8a8_channel,torch.float16,1280,352,4096,129,9,0,0,asm,12001+22001,895.9105 +gfx938,f8_w8a8_channel,torch.float16,1536,352,4096,129,9,0,0,asm,12005+22001,955.506 +gfx938,f8_w8a8_channel,torch.float16,1920,352,4096,129,9,0,0,asm,12001+22001,1193.5008 +gfx938,f8_w8a8_channel,torch.float16,2048,352,4096,129,9,0,0,asm,12001+22001,1239.749 +gfx938,f8_w8a8_channel,torch.float16,2304,352,4096,129,9,0,0,asm,12001+22001,1304.8266 
+gfx938,f8_w8a8_channel,torch.float16,2560,352,4096,129,9,0,0,asm,12001+22001,1346.1569 +gfx938,f8_w8a8_channel,torch.float16,3072,352,4096,129,9,0,0,asm,13001+23001,1491.9416 +gfx938,f8_w8a8_channel,torch.float16,3584,352,4096,129,9,0,0,asm,13001+23001,1740.909 +gfx938,f8_w8a8_channel,torch.float16,3840,352,4096,129,9,0,0,asm,13001+23001,1995.6785 +gfx938,f8_w8a8_channel,torch.float16,4096,352,4096,129,9,0,0,asm,13001+23001,2067.636 +gfx938,f8_w8a8_channel,torch.float16,4608,352,4096,129,9,0,0,asm,13001+23001,2126.1537 +gfx938,f8_w8a8_channel,torch.float16,5120,352,4096,129,9,0,0,asm,13001+23001,2344.7044 +gfx938,f8_w8a8_channel,torch.float16,6144,352,4096,129,9,0,0,asm,13001+23001,2914.0914 +gfx938,f8_w8a8_channel,torch.float16,7168,352,4096,129,9,0,0,asm,13001+23001,3109.3158 +gfx938,f8_w8a8_channel,torch.float16,8192,352,4096,129,9,0,0,asm,13001+23001,3587.9411 +gfx938,f8_w8a8_channel,torch.float16,10240,352,4096,129,9,0,0,asm,13001+23001,4377.0662 +gfx938,f8_w8a8_channel,torch.float16,12288,352,4096,129,9,0,0,asm,13001+23001,5110.4779 +gfx938,f8_w8a8_channel,torch.float16,16384,352,4096,129,9,0,0,asm,13001+23001,6757.208 +gfx938,f8_w8a8_channel,torch.float16,24576,352,4096,129,9,0,0,asm,13001+23001,9975.1734 +gfx938,f8_w8a8_channel,torch.float16,32768,352,4096,129,9,0,0,asm,13001+23001,13179.6065 +gfx936,int8_w8a8_channel,torch.float16,1,384,3072,256,8,0,0,asm,10002+20000,57.8034 +gfx936,int8_w8a8_channel,torch.float16,2,384,3072,256,8,0,0,asm,10006+20000,79.9676 +gfx936,int8_w8a8_channel,torch.float16,4,384,3072,256,8,0,0,asm,10007+20000,127.3875 +gfx936,int8_w8a8_channel,torch.float16,6,384,3072,256,8,0,0,asm,10007+20000,167.0003 +gfx936,int8_w8a8_channel,torch.float16,8,384,3072,256,8,0,0,asm,10007+20000,202.6299 +gfx936,int8_w8a8_channel,torch.float16,12,384,3072,256,8,0,0,asm,10007+20000,264.5336 +gfx936,int8_w8a8_channel,torch.float16,16,384,3072,256,8,0,0,asm,10007+20000,316.8877 
+gfx936,int8_w8a8_channel,torch.float16,24,384,3072,256,8,0,0,asm,10007+20000,415.3612 +gfx936,int8_w8a8_channel,torch.float16,32,384,3072,256,8,0,0,asm,10007+20000,503.9710 +gfx936,int8_w8a8_channel,torch.float16,36,384,3072,256,8,0,0,asm,10007+20000,523.2945 +gfx936,int8_w8a8_channel,torch.float16,48,384,3072,256,8,0,0,asm,10007+20000,584.4568 +gfx936,int8_w8a8_channel,torch.float16,56,384,3072,256,8,0,0,asm,10007+20000,615.8508 +gfx936,int8_w8a8_channel,torch.float16,64,384,3072,256,8,0,0,asm,10006+20000,640.557 +gfx936,int8_w8a8_channel,torch.float16,72,384,3072,256,8,0,0,asm,10006+20000,664.3161 +gfx936,int8_w8a8_channel,torch.float16,80,384,3072,256,8,0,0,asm,10006+20000,687.6761 +gfx936,int8_w8a8_channel,torch.float16,88,384,3072,256,8,0,0,asm,10007+20000,708.2149 +gfx936,int8_w8a8_channel,torch.float16,96,384,3072,256,8,0,0,asm,10006+20000,716.4591 +gfx936,int8_w8a8_channel,torch.float16,100,384,3072,256,8,0,0,asm,10007+20000,721.7644 +gfx936,int8_w8a8_channel,torch.float16,112,384,3072,256,8,0,0,asm,10007+20000,734.3622 +gfx936,int8_w8a8_channel,torch.float16,128,384,3072,256,8,0,0,asm,10007+20000,751.3221 +gfx936,int8_w8a8_channel,torch.float16,144,384,3072,256,8,0,0,asm,10007+20000,759.3304 +gfx936,int8_w8a8_channel,torch.float16,160,384,3072,256,8,0,0,asm,10006+20000,767.7599 +gfx936,int8_w8a8_channel,torch.float16,192,384,3072,256,8,0,0,asm,10007+20000,782.0504 +gfx936,int8_w8a8_channel,torch.float16,224,384,3072,256,8,0,0,asm,10006+20000,795.6587 +gfx936,int8_w8a8_channel,torch.float16,256,384,3072,256,8,0,0,asm,10006+20000,804.6354 +gfx936,int8_w8a8_channel,torch.float16,320,384,3072,256,8,0,0,asm,10007+20000,830.3447 +gfx936,int8_w8a8_channel,torch.float16,384,384,3072,256,8,0,0,asm,11006+21000,854.2437 +gfx936,int8_w8a8_channel,torch.float16,448,384,3072,256,8,0,0,asm,11006+21000,872.6016 +gfx936,int8_w8a8_channel,torch.float16,512,384,3072,256,8,0,0,asm,11006+21000,887.7257 
+gfx936,int8_w8a8_channel,torch.float16,640,384,3072,256,8,0,0,asm,11006+21000,921.6457 +gfx936,int8_w8a8_channel,torch.float16,768,384,3072,256,8,0,0,asm,11006+21000,952.2567 +gfx936,int8_w8a8_channel,torch.float16,896,384,3072,256,8,0,0,asm,12004+22001,1000.5934 +gfx936,int8_w8a8_channel,torch.float16,1024,384,3072,256,8,0,0,asm,12004+22001,1024.0714 +gfx936,int8_w8a8_channel,torch.float16,1280,384,3072,256,8,0,0,asm,12004+22001,1071.4648 +gfx936,int8_w8a8_channel,torch.float16,1536,384,3072,256,8,0,0,asm,12004+22001,1123.8351 +gfx936,int8_w8a8_channel,torch.float16,2048,384,3072,256,8,0,0,asm,12005+22001,1284.3736 +gfx936,int8_w8a8_channel,torch.float16,2304,384,3072,256,8,0,0,asm,12005+22001,1379.2871 +gfx936,int8_w8a8_channel,torch.float16,2560,384,3072,256,8,0,0,asm,13000+23001,1425.6448 +gfx936,int8_w8a8_channel,torch.float16,3072,384,3072,256,8,0,0,asm,13000+23001,1514.0309 +gfx936,int8_w8a8_channel,torch.float16,3584,384,3072,256,8,0,0,asm,13000+23001,1634.6032 +gfx936,int8_w8a8_channel,torch.float16,4096,384,3072,256,8,0,0,asm,13001+23001,1824.514 +gfx936,int8_w8a8_channel,torch.float16,5120,384,3072,256,8,0,0,asm,12001+22001,2216.8495 +gfx936,int8_w8a8_channel,torch.float16,6144,384,3072,256,8,0,0,asm,13001+23001,2364.4616 +gfx936,int8_w8a8_channel,torch.float16,7168,384,3072,256,8,0,0,asm,13001+23001,2538.6083 +gfx936,int8_w8a8_channel,torch.float16,8192,384,3072,256,8,0,0,asm,13001+23001,2952.9812 +gfx936,int8_w8a8_channel,torch.float16,10240,384,3072,256,8,0,0,asm,13001+23001,3585.1639 +gfx936,int8_w8a8_channel,torch.float16,12288,384,3072,256,8,0,0,asm,13001+23001,4181.1698 +gfx936,int8_w8a8_channel,torch.float16,16384,384,3072,256,8,0,0,asm,13001+23001,5377.322 +gfx936,int8_w8a8_channel,torch.float16,24576,384,3072,256,8,0,0,asm,13001+23001,7906.7003 +gfx936,int8_w8a8_channel,torch.float16,32768,384,3072,256,8,0,0,asm,13001+23001,10408.7744 +gfx936,int8_w8a8_channel,torch.float16,1,192,3072,256,8,0,0,asm,10006+20000,44.8855 
+gfx936,int8_w8a8_channel,torch.float16,2,192,3072,256,8,0,0,asm,10009+20000,60.4898 +gfx936,int8_w8a8_channel,torch.float16,4,192,3072,256,8,0,0,asm,10013+20000,83.9172 +gfx936,int8_w8a8_channel,torch.float16,6,192,3072,256,8,0,0,asm,10006+20000,104.9866 +gfx936,int8_w8a8_channel,torch.float16,8,192,3072,256,8,0,0,asm,10009+20001,125.0032 +gfx936,int8_w8a8_channel,torch.float16,12,192,3072,256,8,0,0,asm,10010+20000,160.2622 +gfx936,int8_w8a8_channel,torch.float16,16,192,3072,256,8,0,0,asm,10006+20000,188.6327 +gfx936,int8_w8a8_channel,torch.float16,24,192,3072,256,8,0,0,asm,10007+20001,236.7812 +gfx936,int8_w8a8_channel,torch.float16,32,192,3072,256,8,0,0,asm,10007+20001,292.8094 +gfx936,int8_w8a8_channel,torch.float16,36,192,3072,256,8,0,0,asm,10007+20001,296.5958 +gfx936,int8_w8a8_channel,torch.float16,48,192,3072,256,8,0,0,asm,10007+20000,328.3769 +gfx936,int8_w8a8_channel,torch.float16,56,192,3072,256,8,0,0,asm,10007+20001,343.5516 +gfx936,int8_w8a8_channel,torch.float16,64,192,3072,256,8,0,0,asm,10007+20001,356.6294 +gfx936,int8_w8a8_channel,torch.float16,72,192,3072,256,8,0,0,asm,10007+20001,369.8757 +gfx936,int8_w8a8_channel,torch.float16,80,192,3072,256,8,0,0,asm,10006+20001,377.9768 +gfx936,int8_w8a8_channel,torch.float16,88,192,3072,256,8,0,0,asm,10006+20001,389.4547 +gfx936,int8_w8a8_channel,torch.float16,96,192,3072,256,8,0,0,asm,10006+20000,396.4441 +gfx936,int8_w8a8_channel,torch.float16,100,192,3072,256,8,0,0,asm,10007+20000,395.9387 +gfx936,int8_w8a8_channel,torch.float16,112,192,3072,256,8,0,0,asm,10007+20001,404.8062 +gfx936,int8_w8a8_channel,torch.float16,128,192,3072,256,8,0,0,asm,10006+20000,418.9955 +gfx936,int8_w8a8_channel,torch.float16,144,192,3072,256,8,0,0,asm,10006+20001,422.4733 +gfx936,int8_w8a8_channel,torch.float16,160,192,3072,256,8,0,0,asm,10006+20001,426.6501 +gfx936,int8_w8a8_channel,torch.float16,192,192,3072,256,8,0,0,asm,10006+20001,435.0712 
+gfx936,int8_w8a8_channel,torch.float16,224,192,3072,256,8,0,0,asm,10006+20001,441.6734 +gfx936,int8_w8a8_channel,torch.float16,256,192,3072,256,8,0,0,asm,10006+20001,449.2776 +gfx936,int8_w8a8_channel,torch.float16,320,192,3072,256,8,0,0,asm,10007+20001,465.3449 +gfx936,int8_w8a8_channel,torch.float16,384,192,3072,256,8,0,0,asm,10007+20001,483.1134 +gfx936,int8_w8a8_channel,torch.float16,448,192,3072,256,8,0,0,asm,11006+21001,496.2418 +gfx936,int8_w8a8_channel,torch.float16,512,192,3072,256,8,0,0,asm,11006+21001,511.8880 +gfx936,int8_w8a8_channel,torch.float16,640,192,3072,256,8,0,0,asm,11006+21001,536.2923 +gfx936,int8_w8a8_channel,torch.float16,768,192,3072,256,8,0,0,asm,11006+21001,565.488 +gfx936,int8_w8a8_channel,torch.float16,896,192,3072,256,8,0,0,asm,11004+21001,592.8648 +gfx936,int8_w8a8_channel,torch.float16,1024,192,3072,256,8,0,0,asm,11004+21001,632.0227 +gfx936,int8_w8a8_channel,torch.float16,1280,192,3072,256,8,0,0,asm,12004+22001,678.2626 +gfx936,int8_w8a8_channel,torch.float16,1536,192,3072,256,8,0,0,asm,12004+22001,721.7235 +gfx936,int8_w8a8_channel,torch.float16,2048,192,3072,256,8,0,0,asm,12000+22001,866.7338 +gfx936,int8_w8a8_channel,torch.float16,2304,192,3072,256,8,0,0,asm,13001+23001,968.9905 +gfx936,int8_w8a8_channel,torch.float16,2560,192,3072,256,8,0,0,asm,13001+23001,1005.2935 +gfx936,int8_w8a8_channel,torch.float16,3072,192,3072,256,8,0,0,asm,13000+23001,1075.8535 +gfx936,int8_w8a8_channel,torch.float16,3584,192,3072,256,8,0,0,asm,13001+23001,1161.3776 +gfx936,int8_w8a8_channel,torch.float16,4096,192,3072,256,8,0,0,asm,13001+23001,1384.6193 +gfx936,int8_w8a8_channel,torch.float16,5120,192,3072,256,8,0,0,asm,12000+22001,1677.0231 +gfx936,int8_w8a8_channel,torch.float16,6144,192,3072,256,8,0,0,asm,13001+23001,1893.4521 +gfx936,int8_w8a8_channel,torch.float16,7168,192,3072,256,8,0,0,asm,13001+23001,2042.8661 +gfx936,int8_w8a8_channel,torch.float16,8192,192,3072,256,8,0,0,asm,13001+23001,2372.4733 
+gfx936,int8_w8a8_channel,torch.float16,10240,192,3072,256,8,0,0,asm,13001+23001,2884.8177 +gfx936,int8_w8a8_channel,torch.float16,12288,192,3072,256,8,0,0,asm,13001+23001,3381.5416 +gfx936,int8_w8a8_channel,torch.float16,16384,192,3072,256,8,0,0,asm,13001+23001,4384.9761 +gfx936,int8_w8a8_channel,torch.float16,24576,192,3072,256,8,0,0,asm,13001+23001,6428.7541 +gfx936,int8_w8a8_channel,torch.float16,32768,192,3072,256,8,0,0,asm,13001+23001,8504.9194 +gfx936,int8_w8a8_channel,torch.float16,1,192,5120,160,8,0,0,asm,10006+20000,52.2624 +gfx936,int8_w8a8_channel,torch.float16,2,192,5120,160,8,0,0,asm,10009+20000,70.2834 +gfx936,int8_w8a8_channel,torch.float16,4,192,5120,160,8,0,0,asm,10002+20000,103.8327 +gfx936,int8_w8a8_channel,torch.float16,6,192,5120,160,8,0,0,asm,10007+20000,139.0834 +gfx936,int8_w8a8_channel,torch.float16,8,192,5120,160,8,0,0,asm,10007+20000,171.4032 +gfx936,int8_w8a8_channel,torch.float16,12,192,5120,160,8,0,0,asm,10007+20001,233.6516 +gfx936,int8_w8a8_channel,torch.float16,16,192,5120,160,8,0,0,asm,10007+20000,268.1862 +gfx936,int8_w8a8_channel,torch.float16,24,192,5120,160,8,0,0,asm,10007+20001,321.5474 +gfx936,int8_w8a8_channel,torch.float16,32,192,5120,160,8,0,0,asm,10007+20001,362.0808 +gfx936,int8_w8a8_channel,torch.float16,36,192,5120,160,8,0,0,asm,10006+20001,372.5115 +gfx936,int8_w8a8_channel,torch.float16,48,192,5120,160,8,0,0,asm,10007+20000,397.1009 +gfx936,int8_w8a8_channel,torch.float16,56,192,5120,160,8,0,0,asm,10007+20000,410.7262 +gfx936,int8_w8a8_channel,torch.float16,64,192,5120,160,8,0,0,asm,10007+20000,416.1324 +gfx936,int8_w8a8_channel,torch.float16,72,192,5120,160,8,0,0,asm,10007+20001,423.3745 +gfx936,int8_w8a8_channel,torch.float16,80,192,5120,160,8,0,0,asm,10007+20000,430.4229 +gfx936,int8_w8a8_channel,torch.float16,88,192,5120,160,8,0,0,asm,10007+20001,432.8483 +gfx936,int8_w8a8_channel,torch.float16,96,192,5120,160,8,0,0,asm,10006+20001,437.564 
+gfx936,int8_w8a8_channel,torch.float16,100,192,5120,160,8,0,0,asm,10001+20001,439.9303 +gfx936,int8_w8a8_channel,torch.float16,112,192,5120,160,8,0,0,asm,10006+20001,443.0629 +gfx936,int8_w8a8_channel,torch.float16,128,192,5120,160,8,0,0,asm,10006+20001,446.1113 +gfx936,int8_w8a8_channel,torch.float16,144,192,5120,160,8,0,0,asm,10007+20001,452.2587 +gfx936,int8_w8a8_channel,torch.float16,160,192,5120,160,8,0,0,asm,10006+20001,457.2608 +gfx936,int8_w8a8_channel,torch.float16,192,192,5120,160,8,0,0,asm,10006+20001,466.7092 +gfx936,int8_w8a8_channel,torch.float16,224,192,5120,160,8,0,0,asm,10006+20001,476.2418 +gfx936,int8_w8a8_channel,torch.float16,256,192,5120,160,8,0,0,asm,10006+20001,490.3724 +gfx936,int8_w8a8_channel,torch.float16,320,192,5120,160,8,0,0,asm,11006+21001,506.7597 +gfx936,int8_w8a8_channel,torch.float16,384,192,5120,160,8,0,0,asm,11006+21001,525.1344 +gfx936,int8_w8a8_channel,torch.float16,448,192,5120,160,8,0,0,asm,11006+21001,540.646 +gfx936,int8_w8a8_channel,torch.float16,512,192,5120,160,8,0,0,asm,11006+21001,575.2648 +gfx936,int8_w8a8_channel,torch.float16,640,192,5120,160,8,0,0,asm,12004+22001,626.5237 +gfx936,int8_w8a8_channel,torch.float16,768,192,5120,160,8,0,0,asm,12004+22001,656.4268 +gfx936,int8_w8a8_channel,torch.float16,896,192,5120,160,8,0,0,asm,12004+22001,681.1089 +gfx936,int8_w8a8_channel,torch.float16,1024,192,5120,160,8,0,0,asm,12004+22001,722.4309 +gfx936,int8_w8a8_channel,torch.float16,1280,192,5120,160,8,0,0,asm,12004+22001,840.5107 +gfx936,int8_w8a8_channel,torch.float16,1536,192,5120,160,8,0,0,asm,13001+23001,962.6073 +gfx936,int8_w8a8_channel,torch.float16,2048,192,5120,160,8,0,0,asm,13001+23001,1058.4304 +gfx936,int8_w8a8_channel,torch.float16,2304,192,5120,160,8,0,0,asm,13001+23001,1153.0744 +gfx936,int8_w8a8_channel,torch.float16,2560,192,5120,160,8,0,0,asm,12001+22001,1339.7014 +gfx936,int8_w8a8_channel,torch.float16,3072,192,5120,160,8,0,0,asm,12001+22001,1563.2211 
+gfx936,int8_w8a8_channel,torch.float16,3584,192,5120,160,8,0,0,asm,12001+22001,1709.4608 +gfx936,int8_w8a8_channel,torch.float16,4096,192,5120,160,8,0,0,asm,13001+23001,1842.4543 +gfx936,int8_w8a8_channel,torch.float16,5120,192,5120,160,8,0,0,asm,13001+23001,2276.4578 +gfx936,int8_w8a8_channel,torch.float16,6144,192,5120,160,8,0,0,asm,13001+23001,2694.0318 +gfx936,int8_w8a8_channel,torch.float16,7168,192,5120,160,8,0,0,asm,13001+23001,2929.3831 +gfx936,int8_w8a8_channel,torch.float16,8192,192,5120,160,8,0,0,asm,13001+23001,3537.1883 +gfx936,int8_w8a8_channel,torch.float16,10240,192,5120,160,8,0,0,asm,13001+23001,4197.0934 +gfx936,int8_w8a8_channel,torch.float16,12288,192,5120,160,8,0,0,asm,13001+23001,4879.3646 +gfx936,int8_w8a8_channel,torch.float16,16384,192,5120,160,8,0,0,asm,13001+23001,6534.0736 +gfx936,int8_w8a8_channel,torch.float16,24576,192,5120,160,8,0,0,asm,13001+23001,9540.3264 +gfx936,int8_w8a8_channel,torch.float16,32768,192,5120,160,8,0,0,asm,13001+23001,12666.0375 +gfx936,int8_w8a8_channel,torch.float16,1,96,5120,160,8,0,0,asm,10001+20000,42.0897 +gfx936,int8_w8a8_channel,torch.float16,2,96,5120,160,8,0,0,asm,10006+20001,51.6561 +gfx936,int8_w8a8_channel,torch.float16,4,96,5120,160,8,0,0,asm,10010+20001,71.9255 +gfx936,int8_w8a8_channel,torch.float16,6,96,5120,160,8,0,0,asm,10011+20001,89.837 +gfx936,int8_w8a8_channel,torch.float16,8,96,5120,160,8,0,0,asm,10013+20001,105.9211 +gfx936,int8_w8a8_channel,torch.float16,12,96,5120,160,8,0,0,asm,10006+20001,143.3947 +gfx936,int8_w8a8_channel,torch.float16,16,96,5120,160,8,0,0,asm,10006+20001,162.0727 +gfx936,int8_w8a8_channel,torch.float16,24,96,5120,160,8,0,0,asm,10006+20001,189.3033 +gfx936,int8_w8a8_channel,torch.float16,32,96,5120,160,8,0,0,asm,10007+20001,215.8915 +gfx936,int8_w8a8_channel,torch.float16,36,96,5120,160,8,0,0,asm,10006+20001,217.2443 +gfx936,int8_w8a8_channel,torch.float16,48,96,5120,160,8,0,0,asm,10006+20001,230.3644 
+gfx936,int8_w8a8_channel,torch.float16,56,96,5120,160,8,0,0,asm,10008+20001,241.438 +gfx936,int8_w8a8_channel,torch.float16,64,96,5120,160,8,0,0,asm,10008+20001,246.0443 +gfx936,int8_w8a8_channel,torch.float16,72,96,5120,160,8,0,0,asm,10007+20001,248.8653 +gfx936,int8_w8a8_channel,torch.float16,80,96,5120,160,8,0,0,asm,10007+20001,252.1325 +gfx936,int8_w8a8_channel,torch.float16,88,96,5120,160,8,0,0,asm,10007+20001,253.3452 +gfx936,int8_w8a8_channel,torch.float16,96,96,5120,160,8,0,0,asm,10007+20001,254.6589 +gfx936,int8_w8a8_channel,torch.float16,100,96,5120,160,8,0,0,asm,10007+20001,258.5915 +gfx936,int8_w8a8_channel,torch.float16,112,96,5120,160,8,0,0,asm,10010+20001,261.1095 +gfx936,int8_w8a8_channel,torch.float16,128,96,5120,160,8,0,0,asm,10008+20001,265.8504 +gfx936,int8_w8a8_channel,torch.float16,144,96,5120,160,8,0,0,asm,10006+20001,273.5389 +gfx936,int8_w8a8_channel,torch.float16,160,96,5120,160,8,0,0,asm,10006+20001,276.6042 +gfx936,int8_w8a8_channel,torch.float16,192,96,5120,160,8,0,0,asm,10006+20001,282.76 +gfx936,int8_w8a8_channel,torch.float16,224,96,5120,160,8,0,0,asm,10006+20001,293.7663 +gfx936,int8_w8a8_channel,torch.float16,256,96,5120,160,8,0,0,asm,10013+20001,302.4736 +gfx936,int8_w8a8_channel,torch.float16,320,96,5120,160,8,0,0,asm,11004+21001,321.4126 +gfx936,int8_w8a8_channel,torch.float16,384,96,5120,160,8,0,0,asm,11007+21001,337.4463 +gfx936,int8_w8a8_channel,torch.float16,448,96,5120,160,8,0,0,asm,11007+21001,347.8715 +gfx936,int8_w8a8_channel,torch.float16,512,96,5120,160,8,0,0,asm,11005+21001,372.8314 +gfx936,int8_w8a8_channel,torch.float16,640,96,5120,160,8,0,0,asm,11005+21001,421.7914 +gfx936,int8_w8a8_channel,torch.float16,768,96,5120,160,8,0,0,asm,12004+22001,469.1428 +gfx936,int8_w8a8_channel,torch.float16,896,96,5120,160,8,0,0,asm,12005+22001,493.4038 +gfx936,int8_w8a8_channel,torch.float16,1024,96,5120,160,8,0,0,asm,12005+22001,519.3238 +gfx936,int8_w8a8_channel,torch.float16,1280,96,5120,160,8,0,0,asm,12005+22001,622.8689 
+gfx936,int8_w8a8_channel,torch.float16,1536,96,5120,160,8,0,0,asm,12001+22001,744.5023 +gfx936,int8_w8a8_channel,torch.float16,2048,96,5120,160,8,0,0,asm,13001+23001,832.9569 +gfx936,int8_w8a8_channel,torch.float16,2304,96,5120,160,8,0,0,asm,13001+23001,898.0432 +gfx936,int8_w8a8_channel,torch.float16,2560,96,5120,160,8,0,0,asm,12001+22001,1039.8702 +gfx936,int8_w8a8_channel,torch.float16,3072,96,5120,160,8,0,0,asm,12005+22001,1214.0004 +gfx936,int8_w8a8_channel,torch.float16,3584,96,5120,160,8,0,0,asm,12005+22001,1325.1582 +gfx936,int8_w8a8_channel,torch.float16,4096,96,5120,160,8,0,0,asm,13001+23001,1482.2526 +gfx936,int8_w8a8_channel,torch.float16,5120,96,5120,160,8,0,0,asm,13001+23001,1854.3025 +gfx936,int8_w8a8_channel,torch.float16,6144,96,5120,160,8,0,0,asm,13001+23001,2196.5587 +gfx936,int8_w8a8_channel,torch.float16,7168,96,5120,160,8,0,0,asm,13001+23001,2383.1689 +gfx936,int8_w8a8_channel,torch.float16,8192,96,5120,160,8,0,0,asm,13001+23001,2855.7492 +gfx936,int8_w8a8_channel,torch.float16,10240,96,5120,160,8,0,0,asm,13001+23001,3437.3146 +gfx936,int8_w8a8_channel,torch.float16,12288,96,5120,160,8,0,0,asm,13001+23001,3987.7221 +gfx936,int8_w8a8_channel,torch.float16,16384,96,5120,160,8,0,0,asm,13001+23001,5349.7577 +gfx936,int8_w8a8_channel,torch.float16,24576,96,5120,160,8,0,0,asm,13001+23001,7825.0634 +gfx936,int8_w8a8_channel,torch.float16,32768,96,5120,160,8,0,0,asm,13001+23001,10289.2112 +gfx936,int8_w8a8_channel,torch.float16,1,192,4096,128,8,0,0,asm,10001+20000,49.4919 +gfx936,int8_w8a8_channel,torch.float16,2,192,4096,128,8,0,0,asm,10003+20000,66.0476 +gfx936,int8_w8a8_channel,torch.float16,4,192,4096,128,8,0,0,asm,10001+20000,94.2496 +gfx936,int8_w8a8_channel,torch.float16,6,192,4096,128,8,0,0,asm,10004+20001,120.3632 +gfx936,int8_w8a8_channel,torch.float16,8,192,4096,128,8,0,0,asm,10001+20000,143.3105 +gfx936,int8_w8a8_channel,torch.float16,12,192,4096,128,8,0,0,asm,10001+20000,180.2621 
+gfx936,int8_w8a8_channel,torch.float16,16,192,4096,128,8,0,0,asm,10004+20000,208.0346 +gfx936,int8_w8a8_channel,torch.float16,24,192,4096,128,8,0,0,asm,10001+20000,239.1557 +gfx936,int8_w8a8_channel,torch.float16,32,192,4096,128,8,0,0,asm,10001+20001,259.3861 +gfx936,int8_w8a8_channel,torch.float16,36,192,4096,128,8,0,0,asm,10001+20000,264.5032 +gfx936,int8_w8a8_channel,torch.float16,48,192,4096,128,8,0,0,asm,10001+20001,291.3578 +gfx936,int8_w8a8_channel,torch.float16,56,192,4096,128,8,0,0,asm,10001+20000,291.1558 +gfx936,int8_w8a8_channel,torch.float16,64,192,4096,128,8,0,0,asm,10001+20000,306.3335 +gfx936,int8_w8a8_channel,torch.float16,72,192,4096,128,8,0,0,asm,10001+20000,297.4295 +gfx936,int8_w8a8_channel,torch.float16,80,192,4096,128,8,0,0,asm,10001+20000,300.5621 +gfx936,int8_w8a8_channel,torch.float16,88,192,4096,128,8,0,0,asm,10001+20000,303.5936 +gfx936,int8_w8a8_channel,torch.float16,96,192,4096,128,8,0,0,asm,10001+20000,305.0337 +gfx936,int8_w8a8_channel,torch.float16,100,192,4096,128,8,0,0,asm,10001+20000,306.1537 +gfx936,int8_w8a8_channel,torch.float16,112,192,4096,128,8,0,0,asm,10001+20000,308.8231 +gfx936,int8_w8a8_channel,torch.float16,128,192,4096,128,8,0,0,asm,10001+20001,313.1853 +gfx936,int8_w8a8_channel,torch.float16,144,192,4096,128,8,0,0,asm,10001+20001,319.6357 +gfx936,int8_w8a8_channel,torch.float16,160,192,4096,128,8,0,0,asm,10001+20001,324.4863 +gfx936,int8_w8a8_channel,torch.float16,192,192,4096,128,8,0,0,asm,10006+20001,338.3978 +gfx936,int8_w8a8_channel,torch.float16,224,192,4096,128,8,0,0,asm,10007+20001,351.7957 +gfx936,int8_w8a8_channel,torch.float16,256,192,4096,128,8,0,0,asm,10007+20001,364.2756 +gfx936,int8_w8a8_channel,torch.float16,320,192,4096,128,8,0,0,asm,11000+21001,388.1662 +gfx936,int8_w8a8_channel,torch.float16,384,192,4096,128,8,0,0,asm,11000+21001,406.0188 +gfx936,int8_w8a8_channel,torch.float16,448,192,4096,128,8,0,0,asm,11006+21001,425.2861 
+gfx936,int8_w8a8_channel,torch.float16,512,192,4096,128,8,0,0,asm,11006+21001,446.7596 +gfx936,int8_w8a8_channel,torch.float16,640,192,4096,128,8,0,0,asm,11004+21001,495.1384 +gfx936,int8_w8a8_channel,torch.float16,768,192,4096,128,8,0,0,asm,12004+22001,529.7151 +gfx936,int8_w8a8_channel,torch.float16,896,192,4096,128,8,0,0,asm,12004+22001,549.2014 +gfx936,int8_w8a8_channel,torch.float16,1024,192,4096,128,8,0,0,asm,12004+22001,599.2392 +gfx936,int8_w8a8_channel,torch.float16,1280,192,4096,128,8,0,0,asm,12001+22001,711.3232 +gfx936,int8_w8a8_channel,torch.float16,1536,192,4096,128,8,0,0,asm,12001+22001,757.8579 +gfx936,int8_w8a8_channel,torch.float16,2048,192,4096,128,8,0,0,asm,12001+22001,912.6535 +gfx936,int8_w8a8_channel,torch.float16,2304,192,4096,128,8,0,0,asm,12001+22001,1026.4976 +gfx936,int8_w8a8_channel,torch.float16,2560,192,4096,128,8,0,0,asm,12001+22001,1080.5522 +gfx936,int8_w8a8_channel,torch.float16,3072,192,4096,128,8,0,0,asm,13001+23001,1229.0909 +gfx936,int8_w8a8_channel,torch.float16,3584,192,4096,128,8,0,0,asm,13001+23001,1326.9518 +gfx936,int8_w8a8_channel,torch.float16,4096,192,4096,128,8,0,0,asm,13001+23001,1519.4988 +gfx936,int8_w8a8_channel,torch.float16,5120,192,4096,128,8,0,0,asm,13001+23001,1839.1447 +gfx936,int8_w8a8_channel,torch.float16,6144,192,4096,128,8,0,0,asm,13001+23001,2176.0198 +gfx936,int8_w8a8_channel,torch.float16,7168,192,4096,128,8,0,0,asm,13001+23001,2477.9225 +gfx936,int8_w8a8_channel,torch.float16,8192,192,4096,128,8,0,0,asm,13001+23001,2777.7368 +gfx936,int8_w8a8_channel,torch.float16,10240,192,4096,128,8,0,0,asm,13001+23001,3406.7548 +gfx936,int8_w8a8_channel,torch.float16,12288,192,4096,128,8,0,0,asm,13001+23001,4037.6253 +gfx936,int8_w8a8_channel,torch.float16,16384,192,4096,128,8,0,0,asm,13001+23001,5310.6843 +gfx936,int8_w8a8_channel,torch.float16,24576,192,4096,128,8,0,0,asm,13001+23001,7871.7663 +gfx936,int8_w8a8_channel,torch.float16,32768,192,4096,128,8,0,0,asm,13001+23001,10420.6629 
+gfx936,int8_w8a8_channel,torch.float16,1,96,4096,128,8,0,0,asm,10000+20001,39.8919 +gfx936,int8_w8a8_channel,torch.float16,2,96,4096,128,8,0,0,asm,10000+20001,51.8832 +gfx936,int8_w8a8_channel,torch.float16,4,96,4096,128,8,0,0,asm,10001+20001,68.3214 +gfx936,int8_w8a8_channel,torch.float16,6,96,4096,128,8,0,0,asm,10004+20001,83.2014 +gfx936,int8_w8a8_channel,torch.float16,8,96,4096,128,8,0,0,asm,10001+20001,94.8645 +gfx936,int8_w8a8_channel,torch.float16,12,96,4096,128,8,0,0,asm,10001+20001,118.1485 +gfx936,int8_w8a8_channel,torch.float16,16,96,4096,128,8,0,0,asm,10001+20001,135.2013 +gfx936,int8_w8a8_channel,torch.float16,24,96,4096,128,8,0,0,asm,10001+20001,151.198 +gfx936,int8_w8a8_channel,torch.float16,32,96,4096,128,8,0,0,asm,10001+20001,163.2264 +gfx936,int8_w8a8_channel,torch.float16,36,96,4096,128,8,0,0,asm,10001+20001,160.5708 +gfx936,int8_w8a8_channel,torch.float16,48,96,4096,128,8,0,0,asm,10001+20001,177.8002 +gfx936,int8_w8a8_channel,torch.float16,56,96,4096,128,8,0,0,asm,10001+20001,178.8275 +gfx936,int8_w8a8_channel,torch.float16,64,96,4096,128,8,0,0,asm,10001+20001,193.2052 +gfx936,int8_w8a8_channel,torch.float16,72,96,4096,128,8,0,0,asm,10001+20001,184.4949 +gfx936,int8_w8a8_channel,torch.float16,80,96,4096,128,8,0,0,asm,10001+20001,183.1728 +gfx936,int8_w8a8_channel,torch.float16,88,96,4096,128,8,0,0,asm,10001+20001,186.7517 +gfx936,int8_w8a8_channel,torch.float16,96,96,4096,128,8,0,0,asm,10001+20001,190.8023 +gfx936,int8_w8a8_channel,torch.float16,100,96,4096,128,8,0,0,asm,10001+20001,188.0907 +gfx936,int8_w8a8_channel,torch.float16,112,96,4096,128,8,0,0,asm,10001+20001,191.3074 +gfx936,int8_w8a8_channel,torch.float16,128,96,4096,128,8,0,0,asm,10001+20001,197.4464 +gfx936,int8_w8a8_channel,torch.float16,144,96,4096,128,8,0,0,asm,10001+20001,198.1539 +gfx936,int8_w8a8_channel,torch.float16,160,96,4096,128,8,0,0,asm,10001+20001,203.0464 +gfx936,int8_w8a8_channel,torch.float16,192,96,4096,128,8,0,0,asm,10004+20001,212.4527 
+gfx936,int8_w8a8_channel,torch.float16,224,96,4096,128,8,0,0,asm,10006+20001,227.6695 +gfx936,int8_w8a8_channel,torch.float16,256,96,4096,128,8,0,0,asm,10006+20001,235.0968 +gfx936,int8_w8a8_channel,torch.float16,320,96,4096,128,8,0,0,asm,11000+21001,257.539 +gfx936,int8_w8a8_channel,torch.float16,384,96,4096,128,8,0,0,asm,11000+21001,269.9684 +gfx936,int8_w8a8_channel,torch.float16,448,96,4096,128,8,0,0,asm,11004+21001,284.2084 +gfx936,int8_w8a8_channel,torch.float16,512,96,4096,128,8,0,0,asm,11004+21001,309.8589 +gfx936,int8_w8a8_channel,torch.float16,640,96,4096,128,8,0,0,asm,12004+22001,353.4631 +gfx936,int8_w8a8_channel,torch.float16,768,96,4096,128,8,0,0,asm,12004+22001,373.0925 +gfx936,int8_w8a8_channel,torch.float16,896,96,4096,128,8,0,0,asm,12000+22001,399.3156 +gfx936,int8_w8a8_channel,torch.float16,1024,96,4096,128,8,0,0,asm,12000+22001,445.0587 +gfx936,int8_w8a8_channel,torch.float16,1280,96,4096,128,8,0,0,asm,11005+21001,529.2775 +gfx936,int8_w8a8_channel,torch.float16,1536,96,4096,128,8,0,0,asm,12005+22001,581.3195 +gfx936,int8_w8a8_channel,torch.float16,2048,96,4096,128,8,0,0,asm,12005+22001,703.7529 +gfx936,int8_w8a8_channel,torch.float16,2304,96,4096,128,8,0,0,asm,12005+22001,797.5802 +gfx936,int8_w8a8_channel,torch.float16,2560,96,4096,128,8,0,0,asm,12005+22001,838.2538 +gfx936,int8_w8a8_channel,torch.float16,3072,96,4096,128,8,0,0,asm,12001+22001,962.8515 +gfx936,int8_w8a8_channel,torch.float16,3584,96,4096,128,8,0,0,asm,13001+23000,1089.4702 +gfx936,int8_w8a8_channel,torch.float16,4096,96,4096,128,8,0,0,asm,13001+23001,1229.4868 +gfx936,int8_w8a8_channel,torch.float16,5120,96,4096,128,8,0,0,asm,13001+23001,1491.0864 +gfx936,int8_w8a8_channel,torch.float16,6144,96,4096,128,8,0,0,asm,13001+23001,1771.3386 +gfx936,int8_w8a8_channel,torch.float16,7168,96,4096,128,8,0,0,asm,13001+23001,1998.9508 +gfx936,int8_w8a8_channel,torch.float16,8192,96,4096,128,8,0,0,asm,13001+23001,2247.9693 
+gfx936,int8_w8a8_channel,torch.float16,10240,96,4096,128,8,0,0,asm,13001+23001,2772.4906 +gfx936,int8_w8a8_channel,torch.float16,12288,96,4096,128,8,0,0,asm,13001+23001,3282.9993 +gfx936,int8_w8a8_channel,torch.float16,16384,96,4096,128,8,0,0,asm,13001+23001,4321.1449 +gfx936,int8_w8a8_channel,torch.float16,24576,96,4096,128,8,0,0,asm,13001+23001,6393.1752 +gfx936,int8_w8a8_channel,torch.float16,32768,96,4096,128,8,0,0,asm,13001+23001,8450.1147 +gfx936,int8_w8a8_channel,torch.float16,1,256,3072,256,8,0,0,asm,10002+20000,48.5319 +gfx936,int8_w8a8_channel,torch.float16,2,256,3072,256,8,0,0,asm,10009+20000,64.8519 +gfx936,int8_w8a8_channel,torch.float16,4,256,3072,256,8,0,0,asm,10006+20000,97.4833 +gfx936,int8_w8a8_channel,torch.float16,6,256,3072,256,8,0,0,asm,10009+20000,123.5382 +gfx936,int8_w8a8_channel,torch.float16,8,256,3072,256,8,0,0,asm,10007+20000,148.1611 +gfx936,int8_w8a8_channel,torch.float16,12,256,3072,256,8,0,0,asm,10007+20001,195.8077 +gfx936,int8_w8a8_channel,torch.float16,16,256,3072,256,8,0,0,asm,10007+20000,230.0055 +gfx936,int8_w8a8_channel,torch.float16,24,256,3072,256,8,0,0,asm,10007+20001,293.6989 +gfx936,int8_w8a8_channel,torch.float16,32,256,3072,256,8,0,0,asm,10007+20001,360.8176 +gfx936,int8_w8a8_channel,torch.float16,36,256,3072,256,8,0,0,asm,10007+20001,368.5452 +gfx936,int8_w8a8_channel,torch.float16,48,256,3072,256,8,0,0,asm,10007+20000,410.6672 +gfx936,int8_w8a8_channel,torch.float16,56,256,3072,256,8,0,0,asm,10007+20001,434.7514 +gfx936,int8_w8a8_channel,torch.float16,64,256,3072,256,8,0,0,asm,10006+20000,449.4966 +gfx936,int8_w8a8_channel,torch.float16,72,256,3072,256,8,0,0,asm,10007+20001,465.8249 +gfx936,int8_w8a8_channel,torch.float16,80,256,3072,256,8,0,0,asm,10007+20001,480.9155 +gfx936,int8_w8a8_channel,torch.float16,88,256,3072,256,8,0,0,asm,10006+20001,495.1807 +gfx936,int8_w8a8_channel,torch.float16,96,256,3072,256,8,0,0,asm,10007+20001,504.3090 
+gfx936,int8_w8a8_channel,torch.float16,100,256,3072,256,8,0,0,asm,10006+20001,503.5680 +gfx936,int8_w8a8_channel,torch.float16,112,256,3072,256,8,0,0,asm,10007+20001,513.446 +gfx936,int8_w8a8_channel,torch.float16,128,256,3072,256,8,0,0,asm,10006+20001,526.3723 +gfx936,int8_w8a8_channel,torch.float16,144,256,3072,256,8,0,0,asm,10006+20001,532.8817 +gfx936,int8_w8a8_channel,torch.float16,160,256,3072,256,8,0,0,asm,10007+20001,539.686 +gfx936,int8_w8a8_channel,torch.float16,192,256,3072,256,8,0,0,asm,10006+20001,547.0795 +gfx936,int8_w8a8_channel,torch.float16,224,256,3072,256,8,0,0,asm,10006+20001,557.4037 +gfx936,int8_w8a8_channel,torch.float16,256,256,3072,256,8,0,0,asm,10006+20001,565.4795 +gfx936,int8_w8a8_channel,torch.float16,320,256,3072,256,8,0,0,asm,10006+20001,582.7006 +gfx936,int8_w8a8_channel,torch.float16,384,256,3072,256,8,0,0,asm,11006+21001,602.8521 +gfx936,int8_w8a8_channel,torch.float16,448,256,3072,256,8,0,0,asm,11006+21001,615.5257 +gfx936,int8_w8a8_channel,torch.float16,512,256,3072,256,8,0,0,asm,11006+21001,632.2499 +gfx936,int8_w8a8_channel,torch.float16,640,256,3072,256,8,0,0,asm,11006+21001,656.0731 +gfx936,int8_w8a8_channel,torch.float16,768,256,3072,256,8,0,0,asm,11006+21001,683.6183 +gfx936,int8_w8a8_channel,torch.float16,896,256,3072,256,8,0,0,asm,11007+21001,723.3656 +gfx936,int8_w8a8_channel,torch.float16,1024,256,3072,256,8,0,0,asm,11007+21001,757.1508 +gfx936,int8_w8a8_channel,torch.float16,1280,256,3072,256,8,0,0,asm,12004+22001,816.3592 +gfx936,int8_w8a8_channel,torch.float16,1536,256,3072,256,8,0,0,asm,12004+22001,862.9191 +gfx936,int8_w8a8_channel,torch.float16,2048,256,3072,256,8,0,0,asm,12005+22001,1002.9019 +gfx936,int8_w8a8_channel,torch.float16,2304,256,3072,256,8,0,0,asm,12005+22001,1086.3881 +gfx936,int8_w8a8_channel,torch.float16,2560,256,3072,256,8,0,0,asm,13000+23001,1128.3502 +gfx936,int8_w8a8_channel,torch.float16,3072,256,3072,256,8,0,0,asm,13000+23001,1204.3164 
+gfx936,int8_w8a8_channel,torch.float16,3584,256,3072,256,8,0,0,asm,13000+23001,1307.5498 +gfx936,int8_w8a8_channel,torch.float16,4096,256,3072,256,8,0,0,asm,13001+23001,1488.3075 +gfx936,int8_w8a8_channel,torch.float16,5120,256,3072,256,8,0,0,asm,12001+22001,1775.8017 +gfx936,int8_w8a8_channel,torch.float16,6144,256,3072,256,8,0,0,asm,13001+23001,1976.7277 +gfx936,int8_w8a8_channel,torch.float16,7168,256,3072,256,8,0,0,asm,13001+23001,2122.9169 +gfx936,int8_w8a8_channel,torch.float16,8192,256,3072,256,8,0,0,asm,13001+23001,2456.4827 +gfx936,int8_w8a8_channel,torch.float16,10240,256,3072,256,8,0,0,asm,13001+23001,2976.8103 +gfx936,int8_w8a8_channel,torch.float16,12288,256,3072,256,8,0,0,asm,13001+23001,3479.6053 +gfx936,int8_w8a8_channel,torch.float16,16384,256,3072,256,8,0,0,asm,13001+23001,4491.6122 +gfx936,int8_w8a8_channel,torch.float16,24576,256,3072,256,8,0,0,asm,13001+23001,6587.6761 +gfx936,int8_w8a8_channel,torch.float16,32768,256,3072,256,8,0,0,asm,13001+23001,8660.7336 +gfx936,int8_w8a8_channel,torch.float16,1,128,3072,256,8,0,0,asm,10000+20000,44.8266 +gfx936,int8_w8a8_channel,torch.float16,2,128,3072,256,8,0,0,asm,10002+20100,54.0391 +gfx936,int8_w8a8_channel,torch.float16,4,128,3072,256,8,0,0,asm,10002+20101,69.913 +gfx936,int8_w8a8_channel,torch.float16,6,128,3072,256,8,0,0,asm,10006+20102,85.8454 +gfx936,int8_w8a8_channel,torch.float16,8,128,3072,256,8,0,0,asm,10009+20102,100.2034 +gfx936,int8_w8a8_channel,torch.float16,12,128,3072,256,8,0,0,asm,10006+20001,127.706 +gfx936,int8_w8a8_channel,torch.float16,16,128,3072,256,8,0,0,asm,10007+20001,146.0643 +gfx936,int8_w8a8_channel,torch.float16,24,128,3072,256,8,0,0,asm,10007+20001,178.3224 +gfx936,int8_w8a8_channel,torch.float16,32,128,3072,256,8,0,0,asm,10006+20001,222.8306 +gfx936,int8_w8a8_channel,torch.float16,36,128,3072,256,8,0,0,asm,10007+20001,218.6591 +gfx936,int8_w8a8_channel,torch.float16,48,128,3072,256,8,0,0,asm,10007+20001,238.2969 
+gfx936,int8_w8a8_channel,torch.float16,56,128,3072,256,8,0,0,asm,10007+20001,252.3937 +gfx936,int8_w8a8_channel,torch.float16,64,128,3072,256,8,0,0,asm,10007+20001,262.6758 +gfx936,int8_w8a8_channel,torch.float16,72,128,3072,256,8,0,0,asm,10007+20001,271.4337 +gfx936,int8_w8a8_channel,torch.float16,80,128,3072,256,8,0,0,asm,10007+20001,281.4379 +gfx936,int8_w8a8_channel,torch.float16,88,128,3072,256,8,0,0,asm,10007+20001,285.7495 +gfx936,int8_w8a8_channel,torch.float16,96,128,3072,256,8,0,0,asm,10006+20001,290.5159 +gfx936,int8_w8a8_channel,torch.float16,100,128,3072,256,8,0,0,asm,10007+20001,290.2801 +gfx936,int8_w8a8_channel,torch.float16,112,128,3072,256,8,0,0,asm,10006+20001,297.9432 +gfx936,int8_w8a8_channel,torch.float16,128,128,3072,256,8,0,0,asm,10006+20001,303.0295 +gfx936,int8_w8a8_channel,torch.float16,144,128,3072,256,8,0,0,asm,10007+20001,308.2674 +gfx936,int8_w8a8_channel,torch.float16,160,128,3072,256,8,0,0,asm,10006+20001,312.2084 +gfx936,int8_w8a8_channel,torch.float16,192,128,3072,256,8,0,0,asm,10007+20001,314.9031 +gfx936,int8_w8a8_channel,torch.float16,224,128,3072,256,8,0,0,asm,10006+20001,321.6316 +gfx936,int8_w8a8_channel,torch.float16,256,128,3072,256,8,0,0,asm,10006+20001,325.6231 +gfx936,int8_w8a8_channel,torch.float16,320,128,3072,256,8,0,0,asm,10006+20001,337.0926 +gfx936,int8_w8a8_channel,torch.float16,384,128,3072,256,8,0,0,asm,10006+20001,349.8673 +gfx936,int8_w8a8_channel,torch.float16,448,128,3072,256,8,0,0,asm,11006+21001,364.8736 +gfx936,int8_w8a8_channel,torch.float16,512,128,3072,256,8,0,0,asm,11006+21001,377.1262 +gfx936,int8_w8a8_channel,torch.float16,640,128,3072,256,8,0,0,asm,11002+21001,394.7514 +gfx936,int8_w8a8_channel,torch.float16,768,128,3072,256,8,0,0,asm,11006+21001,415.0208 +gfx936,int8_w8a8_channel,torch.float16,896,128,3072,256,8,0,0,asm,11006+21001,438.9619 +gfx936,int8_w8a8_channel,torch.float16,1024,128,3072,256,8,0,0,asm,11007+21001,469.1261 
+gfx936,int8_w8a8_channel,torch.float16,1280,128,3072,256,8,0,0,asm,12004+21102,498.0102 +gfx936,int8_w8a8_channel,torch.float16,1536,128,3072,256,8,0,0,asm,12004+22001,547.5512 +gfx936,int8_w8a8_channel,torch.float16,2048,128,3072,256,8,0,0,asm,12005+22001,654.5574 +gfx936,int8_w8a8_channel,torch.float16,2304,128,3072,256,8,0,0,asm,12005+21102,716.8814 +gfx936,int8_w8a8_channel,torch.float16,2560,128,3072,256,8,0,0,asm,12005+21102,751.2982 +gfx936,int8_w8a8_channel,torch.float16,3072,128,3072,256,8,0,0,asm,12005+21102,821.4792 +gfx936,int8_w8a8_channel,torch.float16,3584,128,3072,256,8,0,0,asm,12005+22001,902.3379 +gfx936,int8_w8a8_channel,torch.float16,4096,128,3072,256,8,0,0,asm,12005+22001,1016.4852 +gfx936,int8_w8a8_channel,torch.float16,5120,128,3072,256,8,0,0,asm,12005+22001,1233.2848 +gfx936,int8_w8a8_channel,torch.float16,6144,128,3072,256,8,0,0,asm,13001+23001,1410.4297 +gfx936,int8_w8a8_channel,torch.float16,7168,128,3072,256,8,0,0,asm,13001+23001,1532.0043 +gfx936,int8_w8a8_channel,torch.float16,8192,128,3072,256,8,0,0,asm,13001+23001,1765.966 +gfx936,int8_w8a8_channel,torch.float16,10240,128,3072,256,8,0,0,asm,13001+23001,2124.0959 +gfx936,int8_w8a8_channel,torch.float16,12288,128,3072,256,8,0,0,asm,13001+23001,2490.6301 +gfx936,int8_w8a8_channel,torch.float16,16384,128,3072,256,8,0,0,asm,13001+21102,3151.1006 +gfx936,int8_w8a8_channel,torch.float16,24576,128,3072,256,8,0,0,asm,13001+23001,4696.4204 +gfx936,int8_w8a8_channel,torch.float16,32768,128,3072,256,8,0,0,asm,13001+23001,6188.8897 diff --git a/aiter/configs/tuned_fmoe_asm_w8a8_group.csv b/aiter/configs/tuned_fmoe_asm_w8a8_group.csv new file mode 100644 index 0000000000000000000000000000000000000000..a39e41f0c21de0b85f0f973a5a08005205090925 --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm_w8a8_group.csv @@ -0,0 +1,884 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us 
+gfx936,int8_w8a8_block,torch.float16,1,128,7168,256,8,0,0,asm,10008+20000,63.7404 +gfx936,int8_w8a8_block,torch.float16,2,128,7168,256,8,0,0,asm,10007+20101,79.8125 +gfx936,int8_w8a8_block,torch.float16,3,128,7168,256,8,0,0,asm,10002+20101,98.4464 +gfx936,int8_w8a8_block,torch.float16,4,128,7168,256,8,0,0,asm,10003+20101,116.8833 +gfx936,int8_w8a8_block,torch.float16,5,128,7168,256,8,0,0,asm,10006+20101,136.7469 +gfx936,int8_w8a8_block,torch.float16,6,128,7168,256,8,0,0,asm,10006+20101,163.1216 +gfx936,int8_w8a8_block,torch.float16,7,128,7168,256,8,0,0,asm,10006+20101,181.9707 +gfx936,int8_w8a8_block,torch.float16,8,128,7168,256,8,0,0,asm,10005+20101,195.1672 +gfx936,int8_w8a8_block,torch.float16,9,128,7168,256,8,0,0,asm,10005+20101,209.4906 +gfx936,int8_w8a8_block,torch.float16,10,128,7168,256,8,0,0,asm,10003+20101,225.9173 +gfx936,int8_w8a8_block,torch.float16,11,128,7168,256,8,0,0,asm,10003+20101,242.8814 +gfx936,int8_w8a8_block,torch.float16,12,128,7168,256,8,0,0,asm,10003+20101,255.8416 +gfx936,int8_w8a8_block,torch.float16,13,128,7168,256,8,0,0,asm,10007+20101,278.0101 +gfx936,int8_w8a8_block,torch.float16,14,128,7168,256,8,0,0,asm,10007+20101,289.544 +gfx936,int8_w8a8_block,torch.float16,15,128,7168,256,8,0,0,asm,10007+20101,299.8415 +gfx936,int8_w8a8_block,torch.float16,16,128,7168,256,8,0,0,asm,10007+20101,307.062 +gfx936,int8_w8a8_block,torch.float16,17,128,7168,256,8,0,0,asm,10007+20101,313.7174 +gfx936,int8_w8a8_block,torch.float16,18,128,7168,256,8,0,0,asm,10003+20101,323.5229 +gfx936,int8_w8a8_block,torch.float16,20,128,7168,256,8,0,0,asm,10003+20101,349.3318 +gfx936,int8_w8a8_block,torch.float16,24,128,7168,256,8,0,0,asm,10003+20000,404.6977 +gfx936,int8_w8a8_block,torch.float16,28,128,7168,256,8,0,0,asm,10003+20000,446.2247 +gfx936,int8_w8a8_block,torch.float16,32,128,7168,256,8,0,0,asm,10007+20000,493.6303 +gfx936,int8_w8a8_block,torch.float16,34,128,7168,256,8,0,0,asm,10003+20000,489.455 
+gfx936,int8_w8a8_block,torch.float16,36,128,7168,256,8,0,0,asm,10003+20000,503.2231 +gfx936,int8_w8a8_block,torch.float16,40,128,7168,256,8,0,0,asm,10003+20000,517.8023 +gfx936,int8_w8a8_block,torch.float16,44,128,7168,256,8,0,0,asm,10003+20000,537.7135 +gfx936,int8_w8a8_block,torch.float16,48,128,7168,256,8,0,0,asm,10003+20000,554.3905 +gfx936,int8_w8a8_block,torch.float16,56,128,7168,256,8,0,0,asm,10003+20000,593.1802 +gfx936,int8_w8a8_block,torch.float16,64,128,7168,256,8,0,0,asm,10003+20000,606.4151 +gfx936,int8_w8a8_block,torch.float16,68,128,7168,256,8,0,0,asm,10003+20000,612.0233 +gfx936,int8_w8a8_block,torch.float16,72,128,7168,256,8,0,0,asm,10003+20000,624.0316 +gfx936,int8_w8a8_block,torch.float16,80,128,7168,256,8,0,0,asm,10003+20000,640.1463 +gfx936,int8_w8a8_block,torch.float16,88,128,7168,256,8,0,0,asm,10003+20000,659.9413 +gfx936,int8_w8a8_block,torch.float16,96,128,7168,256,8,0,0,asm,10003+20000,690.9444 +gfx936,int8_w8a8_block,torch.float16,104,128,7168,256,8,0,0,asm,10003+20000,696.5691 +gfx936,int8_w8a8_block,torch.float16,112,128,7168,256,8,0,0,asm,10003+20000,704.9834 +gfx936,int8_w8a8_block,torch.float16,128,128,7168,256,8,0,0,asm,10003+20000,712.97 +gfx936,int8_w8a8_block,torch.float16,144,128,7168,256,8,0,0,asm,10003+20000,722.0132 +gfx936,int8_w8a8_block,torch.float16,160,128,7168,256,8,0,0,asm,10003+20000,729.5175 +gfx936,int8_w8a8_block,torch.float16,192,128,7168,256,8,0,0,asm,10003+20000,740.1296 +gfx936,int8_w8a8_block,torch.float16,224,128,7168,256,8,0,0,asm,10003+20000,752.1505 +gfx936,int8_w8a8_block,torch.float16,256,128,7168,256,8,0,0,asm,10003+20000,768.2856 +gfx936,int8_w8a8_block,torch.float16,320,128,7168,256,8,0,0,asm,10003+20000,784.1603 +gfx936,int8_w8a8_block,torch.float16,384,128,7168,256,8,0,0,asm,10003+20000,830.0833 +gfx936,int8_w8a8_block,torch.float16,448,128,7168,256,8,0,0,asm,10003+20101,846.3959 +gfx936,int8_w8a8_block,torch.float16,512,128,7168,256,8,0,0,asm,10006+20101,865.5128 
+gfx936,int8_w8a8_block,torch.float16,576,128,7168,256,8,0,0,asm,10006+20101,898.4684 +gfx936,int8_w8a8_block,torch.float16,640,128,7168,256,8,0,0,asm,10006+20101,893.4055 +gfx936,int8_w8a8_block,torch.float16,704,128,7168,256,8,0,0,asm,10006+20101,938.4277 +gfx936,int8_w8a8_block,torch.float16,768,128,7168,256,8,0,0,asm,10006+20101,953.3184 +gfx936,int8_w8a8_block,torch.float16,832,128,7168,256,8,0,0,asm,10006+20101,978.7358 +gfx936,int8_w8a8_block,torch.float16,896,128,7168,256,8,0,0,asm,11010+21000,1001.8931 +gfx936,int8_w8a8_block,torch.float16,960,128,7168,256,8,0,0,asm,11005+21101,1042.8664 +gfx936,int8_w8a8_block,torch.float16,1024,128,7168,256,8,0,0,asm,11010+21101,1034.9833 +gfx936,int8_w8a8_block,torch.float16,1152,128,7168,256,8,0,0,asm,11010+21101,1114.3541 +gfx936,int8_w8a8_block,torch.float16,1280,128,7168,256,8,0,0,asm,11010+21101,1184.8115 +gfx936,int8_w8a8_block,torch.float16,1408,128,7168,256,8,0,0,asm,11010+21101,1225.7346 +gfx936,int8_w8a8_block,torch.float16,1536,128,7168,256,8,0,0,asm,11010+21000,1249.6403 +gfx936,int8_w8a8_block,torch.float16,1664,128,7168,256,8,0,0,asm,11010+21000,1287.2281 +gfx936,int8_w8a8_block,torch.float16,1792,128,7168,256,8,0,0,asm,11010+21000,1338.4978 +gfx936,int8_w8a8_block,torch.float16,1920,128,7168,256,8,0,0,asm,11010+21000,1430.4692 +gfx936,int8_w8a8_block,torch.float16,2048,128,7168,256,8,0,0,asm,11010+21000,1474.3052 +gfx936,int8_w8a8_block,torch.float16,2304,128,7168,256,8,0,0,asm,11010+21000,1650.0345 +gfx936,int8_w8a8_block,torch.float16,2560,128,7168,256,8,0,0,asm,11010+21000,1784.6972 +gfx936,int8_w8a8_block,torch.float16,2816,128,7168,256,8,0,0,asm,12003+22000,1881.0829 +gfx936,int8_w8a8_block,torch.float16,3072,128,7168,256,8,0,0,asm,12003+22000,1954.5763 +gfx936,int8_w8a8_block,torch.float16,3328,128,7168,256,8,0,0,asm,12003+22000,2018.8923 +gfx936,int8_w8a8_block,torch.float16,3584,128,7168,256,8,0,0,asm,12002+22000,2090.7939 
+gfx936,int8_w8a8_block,torch.float16,3840,128,7168,256,8,0,0,asm,13001+23000,2233.9531 +gfx936,int8_w8a8_block,torch.float16,4096,128,7168,256,8,0,0,asm,12003+22000,2389.9122 +gfx936,int8_w8a8_block,torch.float16,4608,128,7168,256,8,0,0,asm,12003+22000,2796.4766 +gfx936,int8_w8a8_block,torch.float16,5120,128,7168,256,8,0,0,asm,12003+22000,2925.479 +gfx936,int8_w8a8_block,torch.float16,5632,128,7168,256,8,0,0,asm,12003+22000,3117.6873 +gfx936,int8_w8a8_block,torch.float16,6144,128,7168,256,8,0,0,asm,13001+23000,3331.6854 +gfx936,int8_w8a8_block,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23000,3475.6528 +gfx936,int8_w8a8_block,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23000,3553.7657 +gfx936,int8_w8a8_block,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23000,3742.4152 +gfx936,int8_w8a8_block,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23000,4155.1571 +gfx936,int8_w8a8_block,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23000,5095.7734 +gfx936,int8_w8a8_block,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23000,5989.5664 +gfx936,int8_w8a8_block,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23000,6841.5631 +gfx936,int8_w8a8_block,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23000,7769.2863 +gfx936,int8_w8a8_block,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23000,8326.1077 +gfx936,int8_w8a8_block,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23000,11274.742 +gfx936,int8_w8a8_block,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23000,14891.7267 +gfx936,int8_w8a8_block,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,14976.8469 +gfx936,int8_w8a8_block,torch.float16,1,256,7168,256,8,0,0,asm,10007+20000,80.2568 +gfx936,int8_w8a8_block,torch.float16,2,256,7168,256,8,0,0,asm,10003+20000,122.6887 +gfx936,int8_w8a8_block,torch.float16,3,256,7168,256,8,0,0,asm,10006+20000,172.0341 +gfx936,int8_w8a8_block,torch.float16,4,256,7168,256,8,0,0,asm,10003+20000,217.8065 
+gfx936,int8_w8a8_block,torch.float16,5,256,7168,256,8,0,0,asm,10007+20000,248.4376 +gfx936,int8_w8a8_block,torch.float16,6,256,7168,256,8,0,0,asm,10007+20000,282.8901 +gfx936,int8_w8a8_block,torch.float16,7,256,7168,256,8,0,0,asm,10003+20000,317.7122 +gfx936,int8_w8a8_block,torch.float16,8,256,7168,256,8,0,0,asm,10003+20000,336.703 +gfx936,int8_w8a8_block,torch.float16,9,256,7168,256,8,0,0,asm,10007+20000,374.1512 +gfx936,int8_w8a8_block,torch.float16,10,256,7168,256,8,0,0,asm,10003+20000,398.1841 +gfx936,int8_w8a8_block,torch.float16,11,256,7168,256,8,0,0,asm,10003+20000,418.7215 +gfx936,int8_w8a8_block,torch.float16,12,256,7168,256,8,0,0,asm,10003+20000,443.9523 +gfx936,int8_w8a8_block,torch.float16,13,256,7168,256,8,0,0,asm,10007+20000,474.1513 +gfx936,int8_w8a8_block,torch.float16,14,256,7168,256,8,0,0,asm,10003+20000,506.9882 +gfx936,int8_w8a8_block,torch.float16,15,256,7168,256,8,0,0,asm,10003+20000,518.1636 +gfx936,int8_w8a8_block,torch.float16,16,256,7168,256,8,0,0,asm,10003+20000,533.6549 +gfx936,int8_w8a8_block,torch.float16,17,256,7168,256,8,0,0,asm,10007+20000,555.4758 +gfx936,int8_w8a8_block,torch.float16,18,256,7168,256,8,0,0,asm,10007+20000,577.8488 +gfx936,int8_w8a8_block,torch.float16,20,256,7168,256,8,0,0,asm,10003+20000,614.095 +gfx936,int8_w8a8_block,torch.float16,24,256,7168,256,8,0,0,asm,10003+20000,700.5279 +gfx936,int8_w8a8_block,torch.float16,28,256,7168,256,8,0,0,asm,10003+20000,793.3022 +gfx936,int8_w8a8_block,torch.float16,32,256,7168,256,8,0,0,asm,10003+20000,863.8623 +gfx936,int8_w8a8_block,torch.float16,34,256,7168,256,8,0,0,asm,10003+20000,862.3536 +gfx936,int8_w8a8_block,torch.float16,36,256,7168,256,8,0,0,asm,10003+20000,889.8244 +gfx936,int8_w8a8_block,torch.float16,40,256,7168,256,8,0,0,asm,10003+20000,942.5301 +gfx936,int8_w8a8_block,torch.float16,44,256,7168,256,8,0,0,asm,10003+20000,974.4275 +gfx936,int8_w8a8_block,torch.float16,48,256,7168,256,8,0,0,asm,10003+20000,1000.9915 
+gfx936,int8_w8a8_block,torch.float16,56,256,7168,256,8,0,0,asm,10003+20000,1061.9148 +gfx936,int8_w8a8_block,torch.float16,64,256,7168,256,8,0,0,asm,10003+20000,1100.0501 +gfx936,int8_w8a8_block,torch.float16,68,256,7168,256,8,0,0,asm,10003+20000,1111.5972 +gfx936,int8_w8a8_block,torch.float16,72,256,7168,256,8,0,0,asm,10003+20000,1149.8897 +gfx936,int8_w8a8_block,torch.float16,80,256,7168,256,8,0,0,asm,10003+20000,1174.1255 +gfx936,int8_w8a8_block,torch.float16,88,256,7168,256,8,0,0,asm,10003+20000,1207.7073 +gfx936,int8_w8a8_block,torch.float16,96,256,7168,256,8,0,0,asm,10003+20000,1241.1748 +gfx936,int8_w8a8_block,torch.float16,104,256,7168,256,8,0,0,asm,10003+20000,1256.1221 +gfx936,int8_w8a8_block,torch.float16,112,256,7168,256,8,0,0,asm,10003+20000,1267.3836 +gfx936,int8_w8a8_block,torch.float16,128,256,7168,256,8,0,0,asm,10003+20000,1294.4442 +gfx936,int8_w8a8_block,torch.float16,144,256,7168,256,8,0,0,asm,10003+20000,1309.413 +gfx936,int8_w8a8_block,torch.float16,160,256,7168,256,8,0,0,asm,10003+20000,1331.2852 +gfx936,int8_w8a8_block,torch.float16,192,256,7168,256,8,0,0,asm,10003+20000,1338.8944 +gfx936,int8_w8a8_block,torch.float16,224,256,7168,256,8,0,0,asm,10003+20000,1367.4452 +gfx936,int8_w8a8_block,torch.float16,256,256,7168,256,8,0,0,asm,10003+20000,1382.0245 +gfx936,int8_w8a8_block,torch.float16,320,256,7168,256,8,0,0,asm,10003+20000,1411.4382 +gfx936,int8_w8a8_block,torch.float16,384,256,7168,256,8,0,0,asm,10003+20000,1446.7848 +gfx936,int8_w8a8_block,torch.float16,448,256,7168,256,8,0,0,asm,10002+20000,1488.5131 +gfx936,int8_w8a8_block,torch.float16,512,256,7168,256,8,0,0,asm,10001+20000,1510.4197 +gfx936,int8_w8a8_block,torch.float16,576,256,7168,256,8,0,0,asm,10001+20000,1556.0915 +gfx936,int8_w8a8_block,torch.float16,640,256,7168,256,8,0,0,asm,10006+20000,1567.0515 +gfx936,int8_w8a8_block,torch.float16,704,256,7168,256,8,0,0,asm,10006+20000,1610.1326 +gfx936,int8_w8a8_block,torch.float16,768,256,7168,256,8,0,0,asm,10005+20000,1635.4823 
+gfx936,int8_w8a8_block,torch.float16,832,256,7168,256,8,0,0,asm,10006+20000,1682.6332 +gfx936,int8_w8a8_block,torch.float16,896,256,7168,256,8,0,0,asm,11010+21000,1697.1069 +gfx936,int8_w8a8_block,torch.float16,960,256,7168,256,8,0,0,asm,11010+21000,1700.2086 +gfx936,int8_w8a8_block,torch.float16,1024,256,7168,256,8,0,0,asm,11010+21000,1720.3522 +gfx936,int8_w8a8_block,torch.float16,1152,256,7168,256,8,0,0,asm,11010+21000,1778.3173 +gfx936,int8_w8a8_block,torch.float16,1280,256,7168,256,8,0,0,asm,11010+21000,1827.6876 +gfx936,int8_w8a8_block,torch.float16,1408,256,7168,256,8,0,0,asm,11010+21000,1872.7174 +gfx936,int8_w8a8_block,torch.float16,1536,256,7168,256,8,0,0,asm,11010+21000,1895.9745 +gfx936,int8_w8a8_block,torch.float16,1664,256,7168,256,8,0,0,asm,12002+22000,2006.5225 +gfx936,int8_w8a8_block,torch.float16,1792,256,7168,256,8,0,0,asm,11010+21000,2046.6170 +gfx936,int8_w8a8_block,torch.float16,1920,256,7168,256,8,0,0,asm,11010+21000,2121.2345 +gfx936,int8_w8a8_block,torch.float16,2048,256,7168,256,8,0,0,asm,11010+21000,2219.9136 +gfx936,int8_w8a8_block,torch.float16,2304,256,7168,256,8,0,0,asm,12003+22000,2360.6029 +gfx936,int8_w8a8_block,torch.float16,2560,256,7168,256,8,0,0,asm,12003+22000,2489.4727 +gfx936,int8_w8a8_block,torch.float16,2816,256,7168,256,8,0,0,asm,12003+22000,2548.1644 +gfx936,int8_w8a8_block,torch.float16,3072,256,7168,256,8,0,0,asm,12003+22000,2617.9286 +gfx936,int8_w8a8_block,torch.float16,3328,256,7168,256,8,0,0,asm,12002+22000,2725.5706 +gfx936,int8_w8a8_block,torch.float16,3584,256,7168,256,8,0,0,asm,12002+22000,2857.6904 +gfx936,int8_w8a8_block,torch.float16,3840,256,7168,256,8,0,0,asm,12003+22000,3012.7306 +gfx936,int8_w8a8_block,torch.float16,4096,256,7168,256,8,0,0,asm,12003+22000,3240.8408 +gfx936,int8_w8a8_block,torch.float16,4608,256,7168,256,8,0,0,asm,12002+22000,3771.4593 +gfx936,int8_w8a8_block,torch.float16,5120,256,7168,256,8,0,0,asm,12003+22000,3994.7498 
+gfx936,int8_w8a8_block,torch.float16,5632,256,7168,256,8,0,0,asm,12003+22000,4164.8792 +gfx936,int8_w8a8_block,torch.float16,6144,256,7168,256,8,0,0,asm,12003+22000,4594.9062 +gfx936,int8_w8a8_block,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23000,4701.4746 +gfx936,int8_w8a8_block,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23000,4834.3727 +gfx936,int8_w8a8_block,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23000,5090.3775 +gfx936,int8_w8a8_block,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23000,5702.3206 +gfx936,int8_w8a8_block,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23000,6923.0831 +gfx936,int8_w8a8_block,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23000,7896.9686 +gfx936,int8_w8a8_block,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23000,9011.7101 +gfx936,int8_w8a8_block,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23000,10138.9121 +gfx936,int8_w8a8_block,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23000,10933.3982 +gfx936,int8_w8a8_block,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23000,14921.2994 +gfx936,int8_w8a8_block,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23000,19669.379 +gfx938,f8_w8a8_block,torch.float16,1,640,6144,160,8,0,0,asm,10004+20000,114.4839 +gfx938,f8_w8a8_block,torch.float16,2,640,6144,160,8,0,0,asm,10002+20000,182.6896 +gfx938,f8_w8a8_block,torch.float16,4,640,6144,160,8,0,0,asm,10002+20000,296.1749 +gfx938,f8_w8a8_block,torch.float16,6,640,6144,160,8,0,0,asm,10002+20000,396.2887 +gfx938,f8_w8a8_block,torch.float16,8,640,6144,160,8,0,0,asm,10006+20000,443.7173 +gfx938,f8_w8a8_block,torch.float16,10,640,6144,160,8,0,0,asm,10006+20000,523.6483 +gfx938,f8_w8a8_block,torch.float16,12,640,6144,160,8,0,0,asm,10006+20000,708.6077 +gfx938,f8_w8a8_block,torch.float16,14,640,6144,160,8,0,0,asm,10006+20000,709.362 +gfx938,f8_w8a8_block,torch.float16,16,640,6144,160,8,0,0,asm,10006+20000,753.3843 +gfx938,f8_w8a8_block,torch.float16,20,640,6144,160,8,0,0,asm,10006+20000,846.0213 
+gfx938,f8_w8a8_block,torch.float16,24,640,6144,160,8,0,0,asm,10006+20000,924.3524 +gfx938,f8_w8a8_block,torch.float16,28,640,6144,160,8,0,0,asm,10006+20000,1038.6179 +gfx938,f8_w8a8_block,torch.float16,32,640,6144,160,8,0,0,asm,10006+20000,1145.2919 +gfx938,f8_w8a8_block,torch.float16,36,640,6144,160,8,0,0,asm,10006+20000,1136.6487 +gfx938,f8_w8a8_block,torch.float16,40,640,6144,160,8,0,0,asm,10006+20000,1221.997 +gfx938,f8_w8a8_block,torch.float16,44,640,6144,160,8,0,0,asm,10006+20000,1231.3686 +gfx938,f8_w8a8_block,torch.float16,48,640,6144,160,8,0,0,asm,10006+20000,1222.317 +gfx938,f8_w8a8_block,torch.float16,56,640,6144,160,8,0,0,asm,10006+20000,1245.2197 +gfx938,f8_w8a8_block,torch.float16,64,640,6144,160,8,0,0,asm,10006+20000,1321.0139 +gfx938,f8_w8a8_block,torch.float16,72,640,6144,160,8,0,0,asm,10006+20000,1333.0137 +gfx938,f8_w8a8_block,torch.float16,80,640,6144,160,8,0,0,asm,10006+20000,1342.5223 +gfx938,f8_w8a8_block,torch.float16,96,640,6144,160,8,0,0,asm,10006+20000,1349.2423 +gfx938,f8_w8a8_block,torch.float16,112,640,6144,160,8,0,0,asm,10006+20000,1356.7167 +gfx938,f8_w8a8_block,torch.float16,128,640,6144,160,8,0,0,asm,10006+20000,1364.168 +gfx938,f8_w8a8_block,torch.float16,160,640,6144,160,8,0,0,asm,10005+20000,1400.0537 +gfx938,f8_w8a8_block,torch.float16,192,640,6144,160,8,0,0,asm,10006+20000,1375.208 +gfx938,f8_w8a8_block,torch.float16,224,640,6144,160,8,0,0,asm,10006+20000,1382.2936 +gfx938,f8_w8a8_block,torch.float16,256,640,6144,160,8,0,0,asm,11010+21000,1437.9049 +gfx938,f8_w8a8_block,torch.float16,320,640,6144,160,8,0,0,asm,11010+21000,1438.979 +gfx938,f8_w8a8_block,torch.float16,384,640,6144,160,8,0,0,asm,11010+21000,1464.3047 +gfx938,f8_w8a8_block,torch.float16,448,640,6144,160,8,0,0,asm,11010+21000,1474.1106 +gfx938,f8_w8a8_block,torch.float16,512,640,6144,160,8,0,0,asm,11010+21000,1570.43 +gfx938,f8_w8a8_block,torch.float16,576,640,6144,160,8,0,0,asm,11010+21000,1654.9783 
+gfx938,f8_w8a8_block,torch.float16,640,640,6144,160,8,0,0,asm,12003+22000,1710.4297 +gfx938,f8_w8a8_block,torch.float16,704,640,6144,160,8,0,0,asm,12003+22000,1718.4066 +gfx938,f8_w8a8_block,torch.float16,768,640,6144,160,8,0,0,asm,12003+22000,1753.5607 +gfx938,f8_w8a8_block,torch.float16,832,640,6144,160,8,0,0,asm,12003+22000,1732.2353 +gfx938,f8_w8a8_block,torch.float16,896,640,6144,160,8,0,0,asm,12003+22000,1750.4294 +gfx938,f8_w8a8_block,torch.float16,960,640,6144,160,8,0,0,asm,12003+22000,1779.0921 +gfx938,f8_w8a8_block,torch.float16,1024,640,6144,160,8,0,0,asm,12003+22000,1807.4806 +gfx938,f8_w8a8_block,torch.float16,1152,640,6144,160,8,0,0,asm,12003+22000,2098.5427 +gfx938,f8_w8a8_block,torch.float16,1280,640,6144,160,8,0,0,asm,12003+22000,2514.1524 +gfx938,f8_w8a8_block,torch.float16,1408,640,6144,160,8,0,0,asm,13001+23000,2827.8653 +gfx938,f8_w8a8_block,torch.float16,1536,640,6144,160,8,0,0,asm,13001+23000,2882.8137 +gfx938,f8_w8a8_block,torch.float16,1664,640,6144,160,8,0,0,asm,13001+23000,2898.928 +gfx938,f8_w8a8_block,torch.float16,1792,640,6144,160,8,0,0,asm,13001+23000,2948.482 +gfx938,f8_w8a8_block,torch.float16,1920,640,6144,160,8,0,0,asm,13001+23000,2973.7393 +gfx938,f8_w8a8_block,torch.float16,2048,640,6144,160,8,0,0,asm,13001+23000,3015.4761 +gfx938,f8_w8a8_block,torch.float16,2304,640,6144,160,8,0,0,asm,13001+23000,3273.1666 +gfx938,f8_w8a8_block,torch.float16,2560,640,6144,160,8,0,0,asm,13001+23000,4072.615 +gfx938,f8_w8a8_block,torch.float16,2816,640,6144,160,8,0,0,asm,12003+22000,4727.9041 +gfx938,f8_w8a8_block,torch.float16,3072,640,6144,160,8,0,0,asm,12003+22000,5005.6631 +gfx938,f8_w8a8_block,torch.float16,3328,640,6144,160,8,0,0,asm,12002+22000,5183.2396 +gfx938,f8_w8a8_block,torch.float16,3584,640,6144,160,8,0,0,asm,12003+22000,5355.6046 +gfx938,f8_w8a8_block,torch.float16,3840,640,6144,160,8,0,0,asm,13001+23000,5496.1756 +gfx938,f8_w8a8_block,torch.float16,4096,640,6144,160,8,0,0,asm,13001+23000,5534.8724 
+gfx938,f8_w8a8_block,torch.float16,4608,640,6144,160,8,0,0,asm,13001+23000,5797.9573 +gfx938,f8_w8a8_block,torch.float16,5120,640,6144,160,8,0,0,asm,13001+23000,6765.4964 +gfx938,f8_w8a8_block,torch.float16,5632,640,6144,160,8,0,0,asm,13001+23000,7835.6411 +gfx938,f8_w8a8_block,torch.float16,6144,640,6144,160,8,0,0,asm,13001+23000,8137.0799 +gfx938,f8_w8a8_block,torch.float16,6656,640,6144,160,8,0,0,asm,13001+23000,8280.3481 +gfx938,f8_w8a8_block,torch.float16,6144,640,6144,160,8,0,0,asm,13001+23000,8145.354299999999 +gfx938,f8_w8a8_block,torch.float16,7680,640,6144,160,8,0,0,asm,13001+23000,9456.8466 +gfx938,f8_w8a8_block,torch.float16,8192,640,6144,160,8,0,0,asm,13001+23000,10519.174 +gfx938,f8_w8a8_block,torch.float16,10240,640,6144,160,8,0,0,asm,13001+23000,12172.1051 +gfx938,f8_w8a8_block,torch.float16,12288,640,6144,160,8,0,0,asm,13001+23000,14116.4186 +gfx938,f8_w8a8_block,torch.float16,14336,640,6144,160,8,0,0,asm,13001+23000,16463.9983 +gfx938,f8_w8a8_block,torch.float16,16384,640,6144,160,8,0,0,asm,13001+23000,19025.4974 +gfx938,f8_w8a8_block,torch.float16,17408,640,6144,160,8,0,0,asm,13001+23000,19739.369 +gfx938,f8_w8a8_block,torch.float16,24576,640,6144,160,8,0,0,asm,14001+24000,27498.6319 +gfx938,f8_w8a8_block,torch.float16,32768,640,6144,160,8,0,0,asm,13001+23000,36345.2053 +gfx938,f8_w8a8_block,torch.float16,40960,640,6144,160,8,0,0,asm,14001+24000,45806.7474 +gfx938,f8_w8a8_block,torch.float16,49152,640,6144,160,8,0,0,asm,13001+23000,52580.0251 +gfx938,f8_w8a8_block,torch.float16,49152,640,6144,160,8,0,0,asm,13001+23000,52522.882 +gfx938,f8_w8a8_block,torch.float16,57344,640,6144,160,8,0,0,asm,13001+23000,61427.7869 +gfx938,f8_w8a8_block,torch.float16,65536,640,6144,160,8,0,0,asm,13001+23000,70414.4733 +gfx938,int8_w8a8_block,torch.float16,1,128,7168,256,8,0,0,asm,10008+20100,65.3628 +gfx938,int8_w8a8_block,torch.float16,2,128,7168,256,8,0,0,asm,10007+20101,75.4754 +gfx938,int8_w8a8_block,torch.float16,3,128,7168,256,8,0,0,asm,10002+20101,87.4921 
+gfx938,int8_w8a8_block,torch.float16,4,128,7168,256,8,0,0,asm,10002+20101,103.5248 +gfx938,int8_w8a8_block,torch.float16,5,128,7168,256,8,0,0,asm,10005+20101,132.4518 +gfx938,int8_w8a8_block,torch.float16,6,128,7168,256,8,0,0,asm,10006+20101,133.4817 +gfx938,int8_w8a8_block,torch.float16,7,128,7168,256,8,0,0,asm,10006+20101,140.1565 +gfx938,int8_w8a8_block,torch.float16,8,128,7168,256,8,0,0,asm,10006+20101,147.0241 +gfx938,int8_w8a8_block,torch.float16,9,128,7168,256,8,0,0,asm,10005+20101,162.4947 +gfx938,int8_w8a8_block,torch.float16,10,128,7168,256,8,0,0,asm,10002+20101,177.8177 +gfx938,int8_w8a8_block,torch.float16,11,128,7168,256,8,0,0,asm,10007+20101,212.4555 +gfx938,int8_w8a8_block,torch.float16,12,128,7168,256,8,0,0,asm,10007+20101,214.556 +gfx938,int8_w8a8_block,torch.float16,13,128,7168,256,8,0,0,asm,10007+20101,219.7253 +gfx938,int8_w8a8_block,torch.float16,14,128,7168,256,8,0,0,asm,10002+20101,223.9386 +gfx938,int8_w8a8_block,torch.float16,15,128,7168,256,8,0,0,asm,10002+20101,224.7386 +gfx938,int8_w8a8_block,torch.float16,16,128,7168,256,8,0,0,asm,10002+20101,228.4719 +gfx938,int8_w8a8_block,torch.float16,17,128,7168,256,8,0,0,asm,10002+20101,234.594 +gfx938,int8_w8a8_block,torch.float16,18,128,7168,256,8,0,0,asm,11006+21101,243.9848 +gfx938,int8_w8a8_block,torch.float16,20,128,7168,256,8,0,0,asm,10006+20101,258.721 +gfx938,int8_w8a8_block,torch.float16,24,128,7168,256,8,0,0,asm,10006+20101,278.7332 +gfx938,int8_w8a8_block,torch.float16,28,128,7168,256,8,0,0,asm,10002+20101,356.5123 +gfx938,int8_w8a8_block,torch.float16,32,128,7168,256,8,0,0,asm,10002+20101,362.3954 +gfx938,int8_w8a8_block,torch.float16,34,128,7168,256,8,0,0,asm,10002+20101,350.9258 +gfx938,int8_w8a8_block,torch.float16,36,128,7168,256,8,0,0,asm,10001+20101,362.5073 +gfx938,int8_w8a8_block,torch.float16,40,128,7168,256,8,0,0,asm,10006+20101,378.4456 +gfx938,int8_w8a8_block,torch.float16,44,128,7168,256,8,0,0,asm,10006+20101,382.5933 
+gfx938,int8_w8a8_block,torch.float16,48,128,7168,256,8,0,0,asm,10006+20101,389.781 +gfx938,int8_w8a8_block,torch.float16,56,128,7168,256,8,0,0,asm,10006+20101,405.1448 +gfx938,int8_w8a8_block,torch.float16,64,128,7168,256,8,0,0,asm,10001+20101,450.9047 +gfx938,int8_w8a8_block,torch.float16,68,128,7168,256,8,0,0,asm,10002+20101,467.5835 +gfx938,int8_w8a8_block,torch.float16,72,128,7168,256,8,0,0,asm,10001+20101,487.1343 +gfx938,int8_w8a8_block,torch.float16,80,128,7168,256,8,0,0,asm,10002+20101,494.9825 +gfx938,int8_w8a8_block,torch.float16,88,128,7168,256,8,0,0,asm,10002+20101,485.7603 +gfx938,int8_w8a8_block,torch.float16,96,128,7168,256,8,0,0,asm,10001+20101,489.641 +gfx938,int8_w8a8_block,torch.float16,104,128,7168,256,8,0,0,asm,10006+20101,493.7969 +gfx938,int8_w8a8_block,torch.float16,112,128,7168,256,8,0,0,asm,10006+20101,498.8307 +gfx938,int8_w8a8_block,torch.float16,128,128,7168,256,8,0,0,asm,10006+20101,499.3353 +gfx938,int8_w8a8_block,torch.float16,144,128,7168,256,8,0,0,asm,10006+20101,518.006 +gfx938,int8_w8a8_block,torch.float16,160,128,7168,256,8,0,0,asm,10006+20101,516.6111 +gfx938,int8_w8a8_block,torch.float16,192,128,7168,256,8,0,0,asm,10006+20101,526.847 +gfx938,int8_w8a8_block,torch.float16,224,128,7168,256,8,0,0,asm,10006+20101,529.85 +gfx938,int8_w8a8_block,torch.float16,256,128,7168,256,8,0,0,asm,10006+20101,536.2951 +gfx938,int8_w8a8_block,torch.float16,320,128,7168,256,8,0,0,asm,10006+20101,550.3832 +gfx938,int8_w8a8_block,torch.float16,384,128,7168,256,8,0,0,asm,10006+20101,554.535 +gfx938,int8_w8a8_block,torch.float16,448,128,7168,256,8,0,0,asm,10006+20101,604.2578 +gfx938,int8_w8a8_block,torch.float16,512,128,7168,256,8,0,0,asm,11010+21101,625.9029 +gfx938,int8_w8a8_block,torch.float16,576,128,7168,256,8,0,0,asm,11010+21101,643.6505 +gfx938,int8_w8a8_block,torch.float16,640,128,7168,256,8,0,0,asm,11010+21101,655.6136 +gfx938,int8_w8a8_block,torch.float16,704,128,7168,256,8,0,0,asm,11010+21101,664.6596 
+gfx938,int8_w8a8_block,torch.float16,768,128,7168,256,8,0,0,asm,11010+21101,669.9929 +gfx938,int8_w8a8_block,torch.float16,832,128,7168,256,8,0,0,asm,11010+21101,671.3098 +gfx938,int8_w8a8_block,torch.float16,896,128,7168,256,8,0,0,asm,11010+21101,736.6869 +gfx938,int8_w8a8_block,torch.float16,960,128,7168,256,8,0,0,asm,11010+21101,750.3509 +gfx938,int8_w8a8_block,torch.float16,1024,128,7168,256,8,0,0,asm,11010+21101,828.613 +gfx938,int8_w8a8_block,torch.float16,1152,128,7168,256,8,0,0,asm,11010+21101,947.3887 +gfx938,int8_w8a8_block,torch.float16,1280,128,7168,256,8,0,0,asm,11010+21101,989.7032 +gfx938,int8_w8a8_block,torch.float16,1408,128,7168,256,8,0,0,asm,11010+21101,1027.1273 +gfx938,int8_w8a8_block,torch.float16,1536,128,7168,256,8,0,0,asm,12003+22000,1114.4521 +gfx938,int8_w8a8_block,torch.float16,1664,128,7168,256,8,0,0,asm,11010+21101,1140.0913 +gfx938,int8_w8a8_block,torch.float16,1792,128,7168,256,8,0,0,asm,11010+21101,1184.5465 +gfx938,int8_w8a8_block,torch.float16,1920,128,7168,256,8,0,0,asm,11010+21101,1273.1973 +gfx938,int8_w8a8_block,torch.float16,2048,128,7168,256,8,0,0,asm,11010+21101,1336.6726 +gfx938,int8_w8a8_block,torch.float16,2304,128,7168,256,8,0,0,asm,11010+21101,1539.3524 +gfx938,int8_w8a8_block,torch.float16,2560,128,7168,256,8,0,0,asm,11010+21101,1627.5191 +gfx938,int8_w8a8_block,torch.float16,2816,128,7168,256,8,0,0,asm,11010+21101,1752.9816 +gfx938,int8_w8a8_block,torch.float16,3072,128,7168,256,8,0,0,asm,13001+23000,1901.1188 +gfx938,int8_w8a8_block,torch.float16,3328,128,7168,256,8,0,0,asm,13001+23000,1962.5907 +gfx938,int8_w8a8_block,torch.float16,3584,128,7168,256,8,0,0,asm,13001+23000,2056.3314 +gfx938,int8_w8a8_block,torch.float16,3840,128,7168,256,8,0,0,asm,11010+21101,2270.6892 +gfx938,int8_w8a8_block,torch.float16,4096,128,7168,256,8,0,0,asm,11010+21101,2408.8532 +gfx938,int8_w8a8_block,torch.float16,4608,128,7168,256,8,0,0,asm,11010+21101,2691.0561 
+gfx938,int8_w8a8_block,torch.float16,5120,128,7168,256,8,0,0,asm,11010+21101,2965.5062 +gfx938,int8_w8a8_block,torch.float16,5632,128,7168,256,8,0,0,asm,11010+21101,3251.9283 +gfx938,int8_w8a8_block,torch.float16,6144,128,7168,256,8,0,0,asm,11010+21101,3517.052 +gfx938,int8_w8a8_block,torch.float16,6656,128,7168,256,8,0,0,asm,11010+21101,3746.5233 +gfx938,int8_w8a8_block,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23000,3851.4442 +gfx938,int8_w8a8_block,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23000,4036.2064 +gfx938,int8_w8a8_block,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23000,4501.6442 +gfx938,int8_w8a8_block,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23000,5499.8329 +gfx938,int8_w8a8_block,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23000,6461.2996 +gfx938,int8_w8a8_block,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23000,7436.1102 +gfx938,int8_w8a8_block,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23000,8314.3413 +gfx938,int8_w8a8_block,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23000,9069.6552 +gfx938,int8_w8a8_block,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23000,12217.4681 +gfx938,int8_w8a8_block,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23000,16002.2442 +gfx938,int8_w8a8_block,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15681.4438 +gfx938,int8_w8a8_block,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23000,23121.5792 +gfx938,int8_w8a8_block,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23000,27056.9822 +gfx938,int8_w8a8_block,torch.float16,65536,128,7168,256,8,0,0,asm,13001+23000,30905.7893 +gfx938,int8_w8a8_block,torch.float16,1,256,7168,256,8,0,0,asm,10007+20000,77.6185 +gfx938,int8_w8a8_block,torch.float16,2,256,7168,256,8,0,0,asm,10002+20000,105.9251 +gfx938,int8_w8a8_block,torch.float16,3,256,7168,256,8,0,0,asm,10005+20000,140.3853 +gfx938,int8_w8a8_block,torch.float16,4,256,7168,256,8,0,0,asm,10006+20000,168.6228 
+gfx938,int8_w8a8_block,torch.float16,5,256,7168,256,8,0,0,asm,10007+20000,214.0556 +gfx938,int8_w8a8_block,torch.float16,6,256,7168,256,8,0,0,asm,10002+20000,230.3543 +gfx938,int8_w8a8_block,torch.float16,7,256,7168,256,8,0,0,asm,10002+20000,238.9969 +gfx938,int8_w8a8_block,torch.float16,8,256,7168,256,8,0,0,asm,10006+20000,251.8997 +gfx938,int8_w8a8_block,torch.float16,9,256,7168,256,8,0,0,asm,10006+20000,267.6234 +gfx938,int8_w8a8_block,torch.float16,10,256,7168,256,8,0,0,asm,10006+20000,285.4415 +gfx938,int8_w8a8_block,torch.float16,11,256,7168,256,8,0,0,asm,10002+20000,333.1649 +gfx938,int8_w8a8_block,torch.float16,12,256,7168,256,8,0,0,asm,10002+20000,344.0761 +gfx938,int8_w8a8_block,torch.float16,13,256,7168,256,8,0,0,asm,10001+20000,349.9011 +gfx938,int8_w8a8_block,torch.float16,14,256,7168,256,8,0,0,asm,10006+20000,365.0726 +gfx938,int8_w8a8_block,torch.float16,15,256,7168,256,8,0,0,asm,10006+20000,368.4375 +gfx938,int8_w8a8_block,torch.float16,16,256,7168,256,8,0,0,asm,10006+20000,377.921 +gfx938,int8_w8a8_block,torch.float16,17,256,7168,256,8,0,0,asm,10005+20000,382.6404 +gfx938,int8_w8a8_block,torch.float16,18,256,7168,256,8,0,0,asm,10005+20000,399.2173 +gfx938,int8_w8a8_block,torch.float16,20,256,7168,256,8,0,0,asm,10006+20000,449.5929 +gfx938,int8_w8a8_block,torch.float16,24,256,7168,256,8,0,0,asm,10006+20000,474.6303 +gfx938,int8_w8a8_block,torch.float16,28,256,7168,256,8,0,0,asm,10006+20000,569.4063 +gfx938,int8_w8a8_block,torch.float16,32,256,7168,256,8,0,0,asm,10006+20000,581.0038 +gfx938,int8_w8a8_block,torch.float16,34,256,7168,256,8,0,0,asm,10006+20000,573.6796 +gfx938,int8_w8a8_block,torch.float16,36,256,7168,256,8,0,0,asm,10006+20000,589.4773 +gfx938,int8_w8a8_block,torch.float16,40,256,7168,256,8,0,0,asm,10006+20000,648.7822 +gfx938,int8_w8a8_block,torch.float16,44,256,7168,256,8,0,0,asm,10006+20000,656.8404 +gfx938,int8_w8a8_block,torch.float16,48,256,7168,256,8,0,0,asm,10006+20000,668.0452 
+gfx938,int8_w8a8_block,torch.float16,56,256,7168,256,8,0,0,asm,10006+20000,688.69 +gfx938,int8_w8a8_block,torch.float16,64,256,7168,256,8,0,0,asm,10006+20000,745.0293 +gfx938,int8_w8a8_block,torch.float16,68,256,7168,256,8,0,0,asm,10005+20000,796.2971 +gfx938,int8_w8a8_block,torch.float16,72,256,7168,256,8,0,0,asm,10006+20000,769.4408 +gfx938,int8_w8a8_block,torch.float16,80,256,7168,256,8,0,0,asm,10006+20000,793.9629 +gfx938,int8_w8a8_block,torch.float16,88,256,7168,256,8,0,0,asm,10006+20000,789.2854 +gfx938,int8_w8a8_block,torch.float16,96,256,7168,256,8,0,0,asm,10006+20000,784.1078 +gfx938,int8_w8a8_block,torch.float16,104,256,7168,256,8,0,0,asm,10006+20000,807.4015 +gfx938,int8_w8a8_block,torch.float16,112,256,7168,256,8,0,0,asm,10006+20000,808.7332 +gfx938,int8_w8a8_block,torch.float16,128,256,7168,256,8,0,0,asm,10006+20000,822.8422 +gfx938,int8_w8a8_block,torch.float16,144,256,7168,256,8,0,0,asm,10006+20000,864.7766 +gfx938,int8_w8a8_block,torch.float16,160,256,7168,256,8,0,0,asm,10006+20000,870.1813 +gfx938,int8_w8a8_block,torch.float16,192,256,7168,256,8,0,0,asm,10006+20000,880.6561 +gfx938,int8_w8a8_block,torch.float16,224,256,7168,256,8,0,0,asm,10006+20000,896.0501 +gfx938,int8_w8a8_block,torch.float16,256,256,7168,256,8,0,0,asm,10006+20000,903.3266 +gfx938,int8_w8a8_block,torch.float16,320,256,7168,256,8,0,0,asm,10006+20000,925.3824 +gfx938,int8_w8a8_block,torch.float16,384,256,7168,256,8,0,0,asm,10006+20000,944.2336 +gfx938,int8_w8a8_block,torch.float16,448,256,7168,256,8,0,0,asm,11010+21000,1002.260 +gfx938,int8_w8a8_block,torch.float16,512,256,7168,256,8,0,0,asm,11010+21000,1016.7371 +gfx938,int8_w8a8_block,torch.float16,576,256,7168,256,8,0,0,asm,11010+21000,1038.1561 +gfx938,int8_w8a8_block,torch.float16,640,256,7168,256,8,0,0,asm,11010+21000,1053.8022 +gfx938,int8_w8a8_block,torch.float16,704,256,7168,256,8,0,0,asm,11010+21000,1065.2371 +gfx938,int8_w8a8_block,torch.float16,768,256,7168,256,8,0,0,asm,11010+21000,1086.4834 
+gfx938,int8_w8a8_block,torch.float16,832,256,7168,256,8,0,0,asm,11010+21000,1097.2841 +gfx938,int8_w8a8_block,torch.float16,896,256,7168,256,8,0,0,asm,11010+21000,1173.5599 +gfx938,int8_w8a8_block,torch.float16,960,256,7168,256,8,0,0,asm,11010+21000,1268.5115 +gfx938,int8_w8a8_block,torch.float16,1024,256,7168,256,8,0,0,asm,11010+21000,1379.5732 +gfx938,int8_w8a8_block,torch.float16,1152,256,7168,256,8,0,0,asm,12003+22000,1498.5623 +gfx938,int8_w8a8_block,torch.float16,1280,256,7168,256,8,0,0,asm,12003+22000,1532.3256 +gfx938,int8_w8a8_block,torch.float16,1408,256,7168,256,8,0,0,asm,12003+22000,1544.3759 +gfx938,int8_w8a8_block,torch.float16,1536,256,7168,256,8,0,0,asm,12003+22000,1583.7134 +gfx938,int8_w8a8_block,torch.float16,1664,256,7168,256,8,0,0,asm,12003+22000,1621.6496 +gfx938,int8_w8a8_block,torch.float16,1792,256,7168,256,8,0,0,asm,12003+22000,1779.0828 +gfx938,int8_w8a8_block,torch.float16,1920,256,7168,256,8,0,0,asm,12003+22000,1871.3132 +gfx938,int8_w8a8_block,torch.float16,2048,256,7168,256,8,0,0,asm,12003+22000,2157.311 +gfx938,int8_w8a8_block,torch.float16,2304,256,7168,256,8,0,0,asm,13001+23000,2490.01 +gfx938,int8_w8a8_block,torch.float16,2560,256,7168,256,8,0,0,asm,13001+23000,2534.3288 +gfx938,int8_w8a8_block,torch.float16,2816,256,7168,256,8,0,0,asm,13001+23000,2579.0659 +gfx938,int8_w8a8_block,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23000,2653.1717 +gfx938,int8_w8a8_block,torch.float16,3328,256,7168,256,8,0,0,asm,13001+23000,2745.7236 +gfx938,int8_w8a8_block,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23000,2851.2956 +gfx938,int8_w8a8_block,torch.float16,3840,256,7168,256,8,0,0,asm,13001+23000,3127.3154 +gfx938,int8_w8a8_block,torch.float16,4096,256,7168,256,8,0,0,asm,13001+23000,3611.17 +gfx938,int8_w8a8_block,torch.float16,4608,256,7168,256,8,0,0,asm,12003+22000,4208.9501 +gfx938,int8_w8a8_block,torch.float16,5120,256,7168,256,8,0,0,asm,12003+22000,4453.8108 
+gfx938,int8_w8a8_block,torch.float16,5632,256,7168,256,8,0,0,asm,12003+22000,4666.0811 +gfx938,int8_w8a8_block,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23000,4965.1708 +gfx938,int8_w8a8_block,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23000,5061.4082 +gfx938,int8_w8a8_block,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23000,5175.7386 +gfx938,int8_w8a8_block,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23000,5490.6364 +gfx938,int8_w8a8_block,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23000,6144.4436 +gfx938,int8_w8a8_block,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23000,7471.2098 +gfx938,int8_w8a8_block,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23000,8680.1129 +gfx938,int8_w8a8_block,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23000,10012.1071 +gfx938,int8_w8a8_block,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23000,11238.8407 +gfx938,int8_w8a8_block,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23000,12274.2076 +gfx938,int8_w8a8_block,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23000,16393.0752 +gfx938,int8_w8a8_block,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23000,21424.9168 +gfx938,int8_w8a8_block,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23000,25835.4239 +gfx938,int8_w8a8_block,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23000,30968.2254 +gfx938,int8_w8a8_block,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23000,36235.786 +gfx938,int8_w8a8_block,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23000,41438.8945 +gfx938,f8_w8a8_block,torch.float16,1,128,7168,256,8,0,0,asm,10008+20100,66.5653 +gfx938,f8_w8a8_block,torch.float16,2,128,7168,256,8,0,0,asm,10007+20101,75.3486 +gfx938,f8_w8a8_block,torch.float16,3,128,7168,256,8,0,0,asm,10002+20101,87.5783 +gfx938,f8_w8a8_block,torch.float16,4,128,7168,256,8,0,0,asm,10002+20101,101.6294 +gfx938,f8_w8a8_block,torch.float16,5,128,7168,256,8,0,0,asm,10006+20101,130.0846 
+gfx938,f8_w8a8_block,torch.float16,6,128,7168,256,8,0,0,asm,10005+20101,131.9144 +gfx938,f8_w8a8_block,torch.float16,7,128,7168,256,8,0,0,asm,10006+20101,139.9513 +gfx938,f8_w8a8_block,torch.float16,8,128,7168,256,8,0,0,asm,10006+20101,142.8436 +gfx938,f8_w8a8_block,torch.float16,9,128,7168,256,8,0,0,asm,10006+20101,160.0702 +gfx938,f8_w8a8_block,torch.float16,10,128,7168,256,8,0,0,asm,10006+20101,176.2466 +gfx938,f8_w8a8_block,torch.float16,11,128,7168,256,8,0,0,asm,10007+20101,212.2792 +gfx938,f8_w8a8_block,torch.float16,12,128,7168,256,8,0,0,asm,10007+20101,215.5613 +gfx938,f8_w8a8_block,torch.float16,13,128,7168,256,8,0,0,asm,10007+20101,218.7449 +gfx938,f8_w8a8_block,torch.float16,14,128,7168,256,8,0,0,asm,10001+20101,223.9592 +gfx938,f8_w8a8_block,torch.float16,15,128,7168,256,8,0,0,asm,10002+20101,225.4525 +gfx938,f8_w8a8_block,torch.float16,16,128,7168,256,8,0,0,asm,10002+20101,233.5714 +gfx938,f8_w8a8_block,torch.float16,17,128,7168,256,8,0,0,asm,10002+20101,228.9449 +gfx938,f8_w8a8_block,torch.float16,18,128,7168,256,8,0,0,asm,11006+21101,244.153 +gfx938,f8_w8a8_block,torch.float16,20,128,7168,256,8,0,0,asm,10006+20101,259.0782 +gfx938,f8_w8a8_block,torch.float16,24,128,7168,256,8,0,0,asm,10006+20101,281.4083 +gfx938,f8_w8a8_block,torch.float16,28,128,7168,256,8,0,0,asm,10007+20101,353.9607 +gfx938,f8_w8a8_block,torch.float16,32,128,7168,256,8,0,0,asm,10002+20101,363.335 +gfx938,f8_w8a8_block,torch.float16,34,128,7168,256,8,0,0,asm,10002+20101,352.4438 +gfx938,f8_w8a8_block,torch.float16,36,128,7168,256,8,0,0,asm,10002+20101,360.4028 +gfx938,f8_w8a8_block,torch.float16,40,128,7168,256,8,0,0,asm,10006+20101,377.2437 +gfx938,f8_w8a8_block,torch.float16,44,128,7168,256,8,0,0,asm,10006+20101,381.695 +gfx938,f8_w8a8_block,torch.float16,48,128,7168,256,8,0,0,asm,10006+20101,387.1144 +gfx938,f8_w8a8_block,torch.float16,56,128,7168,256,8,0,0,asm,10006+20101,403.1349 +gfx938,f8_w8a8_block,torch.float16,64,128,7168,256,8,0,0,asm,10001+20101,451.6721 
+gfx938,f8_w8a8_block,torch.float16,68,128,7168,256,8,0,0,asm,10007+20101,469.6886 +gfx938,f8_w8a8_block,torch.float16,72,128,7168,256,8,0,0,asm,10007+20101,473.0217 +gfx938,f8_w8a8_block,torch.float16,80,128,7168,256,8,0,0,asm,10007+20101,476.11 +gfx938,f8_w8a8_block,torch.float16,88,128,7168,256,8,0,0,asm,11006+21101,488.6581 +gfx938,f8_w8a8_block,torch.float16,96,128,7168,256,8,0,0,asm,10002+20101,487.6638 +gfx938,f8_w8a8_block,torch.float16,104,128,7168,256,8,0,0,asm,10006+20101,493.3294 +gfx938,f8_w8a8_block,torch.float16,112,128,7168,256,8,0,0,asm,10006+20101,494.3222 +gfx938,f8_w8a8_block,torch.float16,128,128,7168,256,8,0,0,asm,10006+20101,498.2566 +gfx938,f8_w8a8_block,torch.float16,144,128,7168,256,8,0,0,asm,10006+20101,520.7139 +gfx938,f8_w8a8_block,torch.float16,160,128,7168,256,8,0,0,asm,10006+20101,517.3169 +gfx938,f8_w8a8_block,torch.float16,192,128,7168,256,8,0,0,asm,10006+20101,525.3252 +gfx938,f8_w8a8_block,torch.float16,224,128,7168,256,8,0,0,asm,10006+20101,531.1508 +gfx938,f8_w8a8_block,torch.float16,256,128,7168,256,8,0,0,asm,10006+20101,539.0933 +gfx938,f8_w8a8_block,torch.float16,320,128,7168,256,8,0,0,asm,10006+20101,546.1825 +gfx938,f8_w8a8_block,torch.float16,384,128,7168,256,8,0,0,asm,10006+20101,555.8809 +gfx938,f8_w8a8_block,torch.float16,448,128,7168,256,8,0,0,asm,10006+20101,604.7094 +gfx938,f8_w8a8_block,torch.float16,512,128,7168,256,8,0,0,asm,11010+21101,625.7268 +gfx938,f8_w8a8_block,torch.float16,576,128,7168,256,8,0,0,asm,11010+21101,636.9596 +gfx938,f8_w8a8_block,torch.float16,640,128,7168,256,8,0,0,asm,11010+21101,649.7718 +gfx938,f8_w8a8_block,torch.float16,704,128,7168,256,8,0,0,asm,11010+21101,657.3738 +gfx938,f8_w8a8_block,torch.float16,768,128,7168,256,8,0,0,asm,11010+21101,664.6107 +gfx938,f8_w8a8_block,torch.float16,832,128,7168,256,8,0,0,asm,11010+21101,670.8138 +gfx938,f8_w8a8_block,torch.float16,896,128,7168,256,8,0,0,asm,11010+21101,737.3488 
+gfx938,f8_w8a8_block,torch.float16,960,128,7168,256,8,0,0,asm,11010+21101,748.9636 +gfx938,f8_w8a8_block,torch.float16,1024,128,7168,256,8,0,0,asm,11010+21101,825.1375 +gfx938,f8_w8a8_block,torch.float16,1152,128,7168,256,8,0,0,asm,11010+21101,941.7131 +gfx938,f8_w8a8_block,torch.float16,1280,128,7168,256,8,0,0,asm,11010+21101,981.3457 +gfx938,f8_w8a8_block,torch.float16,1408,128,7168,256,8,0,0,asm,11010+21101,1017.7776 +gfx938,f8_w8a8_block,torch.float16,1536,128,7168,256,8,0,0,asm,12003+22000,1084.6548 +gfx938,f8_w8a8_block,torch.float16,1664,128,7168,256,8,0,0,asm,12003+22000,1110.2295 +gfx938,f8_w8a8_block,torch.float16,1792,128,7168,256,8,0,0,asm,11010+21101,1184.1491 +gfx938,f8_w8a8_block,torch.float16,1920,128,7168,256,8,0,0,asm,11010+21101,1268.0833 +gfx938,f8_w8a8_block,torch.float16,2048,128,7168,256,8,0,0,asm,11010+21101,1327.7161 +gfx938,f8_w8a8_block,torch.float16,2304,128,7168,256,8,0,0,asm,11010+21101,1537.3458 +gfx938,f8_w8a8_block,torch.float16,2560,128,7168,256,8,0,0,asm,11010+21101,1623.1556 +gfx938,f8_w8a8_block,torch.float16,2816,128,7168,256,8,0,0,asm,11010+21101,1743.6874 +gfx938,f8_w8a8_block,torch.float16,3072,128,7168,256,8,0,0,asm,13001+23000,1801.0634 +gfx938,f8_w8a8_block,torch.float16,3328,128,7168,256,8,0,0,asm,13001+23000,1867.9686 +gfx938,f8_w8a8_block,torch.float16,3584,128,7168,256,8,0,0,asm,13001+23000,1962.0926 +gfx938,f8_w8a8_block,torch.float16,3840,128,7168,256,8,0,0,asm,13001+23000,2202.7211 +gfx938,f8_w8a8_block,torch.float16,4096,128,7168,256,8,0,0,asm,11010+21101,2395.3579 +gfx938,f8_w8a8_block,torch.float16,4608,128,7168,256,8,0,0,asm,11010+21101,2679.7118 +gfx938,f8_w8a8_block,torch.float16,5120,128,7168,256,8,0,0,asm,11010+21101,2954.1683 +gfx938,f8_w8a8_block,torch.float16,5632,128,7168,256,8,0,0,asm,12003+22000,3219.4688 +gfx938,f8_w8a8_block,torch.float16,6144,128,7168,256,8,0,0,asm,14001+24000,3479.9218 +gfx938,f8_w8a8_block,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23000,3570.5668 
+gfx938,f8_w8a8_block,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23000,3670.2289 +gfx938,f8_w8a8_block,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23000,3839.9203 +gfx938,f8_w8a8_block,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23000,4282.3809 +gfx938,f8_w8a8_block,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23000,5219.3544 +gfx938,f8_w8a8_block,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23000,6140.1191 +gfx938,f8_w8a8_block,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23000,7078.5975 +gfx938,f8_w8a8_block,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23000,7927.4133 +gfx938,f8_w8a8_block,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23000,8632.191 +gfx938,f8_w8a8_block,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23000,11640.6421 +gfx938,f8_w8a8_block,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23000,15245.6187 +gfx938,f8_w8a8_block,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15088.5354 +gfx938,f8_w8a8_block,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23000,21967.1596 +gfx938,f8_w8a8_block,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23000,25747.9781 +gfx938,f8_w8a8_block,torch.float16,65536,128,7168,256,8,0,0,asm,13001+23000,29469.4784 +gfx938,f8_w8a8_block,torch.float16,1,256,7168,256,8,0,0,asm,10007+20200,77.7863 +gfx938,f8_w8a8_block,torch.float16,2,256,7168,256,8,0,0,asm,10002+20000,108.981 +gfx938,f8_w8a8_block,torch.float16,3,256,7168,256,8,0,0,asm,10006+20000,140.2978 +gfx938,f8_w8a8_block,torch.float16,4,256,7168,256,8,0,0,asm,10006+20000,166.45 +gfx938,f8_w8a8_block,torch.float16,5,256,7168,256,8,0,0,asm,10007+20000,212.665 +gfx938,f8_w8a8_block,torch.float16,6,256,7168,256,8,0,0,asm,10002+20000,222.8202 +gfx938,f8_w8a8_block,torch.float16,7,256,7168,256,8,0,0,asm,10001+20000,239.1831 +gfx938,f8_w8a8_block,torch.float16,8,256,7168,256,8,0,0,asm,10006+20000,252.5874 +gfx938,f8_w8a8_block,torch.float16,9,256,7168,256,8,0,0,asm,10006+20000,265.8496 
+gfx938,f8_w8a8_block,torch.float16,10,256,7168,256,8,0,0,asm,10006+20000,284.6495 +gfx938,f8_w8a8_block,torch.float16,11,256,7168,256,8,0,0,asm,10002+20000,331.5774 +gfx938,f8_w8a8_block,torch.float16,12,256,7168,256,8,0,0,asm,10002+20000,338.8226 +gfx938,f8_w8a8_block,torch.float16,13,256,7168,256,8,0,0,asm,10001+20000,348.3648 +gfx938,f8_w8a8_block,torch.float16,14,256,7168,256,8,0,0,asm,10006+20000,362.2937 +gfx938,f8_w8a8_block,torch.float16,15,256,7168,256,8,0,0,asm,10006+20000,370.0803 +gfx938,f8_w8a8_block,torch.float16,16,256,7168,256,8,0,0,asm,10006+20000,374.9336 +gfx938,f8_w8a8_block,torch.float16,17,256,7168,256,8,0,0,asm,10006+20000,380.9883 +gfx938,f8_w8a8_block,torch.float16,18,256,7168,256,8,0,0,asm,10006+20000,395.6061 +gfx938,f8_w8a8_block,torch.float16,20,256,7168,256,8,0,0,asm,10006+20000,451.4192 +gfx938,f8_w8a8_block,torch.float16,24,256,7168,256,8,0,0,asm,10006+20000,475.0546 +gfx938,f8_w8a8_block,torch.float16,28,256,7168,256,8,0,0,asm,10006+20000,567.282 +gfx938,f8_w8a8_block,torch.float16,32,256,7168,256,8,0,0,asm,10006+20000,578.7283 +gfx938,f8_w8a8_block,torch.float16,34,256,7168,256,8,0,0,asm,10006+20000,575.8412 +gfx938,f8_w8a8_block,torch.float16,36,256,7168,256,8,0,0,asm,10006+20000,589.5386 +gfx938,f8_w8a8_block,torch.float16,40,256,7168,256,8,0,0,asm,10006+20000,648.6628 +gfx938,f8_w8a8_block,torch.float16,44,256,7168,256,8,0,0,asm,10006+20000,658.6983 +gfx938,f8_w8a8_block,torch.float16,48,256,7168,256,8,0,0,asm,10006+20000,665.4139 +gfx938,f8_w8a8_block,torch.float16,56,256,7168,256,8,0,0,asm,10006+20000,684.4388 +gfx938,f8_w8a8_block,torch.float16,64,256,7168,256,8,0,0,asm,10006+20000,741.6224 +gfx938,f8_w8a8_block,torch.float16,68,256,7168,256,8,0,0,asm,10006+20000,751.4839 +gfx938,f8_w8a8_block,torch.float16,72,256,7168,256,8,0,0,asm,10004+20000,767.0313 +gfx938,f8_w8a8_block,torch.float16,80,256,7168,256,8,0,0,asm,11010+21000,811.3511 +gfx938,f8_w8a8_block,torch.float16,88,256,7168,256,8,0,0,asm,10006+20000,790.4088 
+gfx938,f8_w8a8_block,torch.float16,96,256,7168,256,8,0,0,asm,10006+20000,783.8489 +gfx938,f8_w8a8_block,torch.float16,104,256,7168,256,8,0,0,asm,10006+20000,808.2379 +gfx938,f8_w8a8_block,torch.float16,112,256,7168,256,8,0,0,asm,10006+20000,811.6932 +gfx938,f8_w8a8_block,torch.float16,128,256,7168,256,8,0,0,asm,10006+20000,822.2664 +gfx938,f8_w8a8_block,torch.float16,144,256,7168,256,8,0,0,asm,10006+20000,864.5257 +gfx938,f8_w8a8_block,torch.float16,160,256,7168,256,8,0,0,asm,10006+20000,870.8529 +gfx938,f8_w8a8_block,torch.float16,192,256,7168,256,8,0,0,asm,10006+20000,882.0707 +gfx938,f8_w8a8_block,torch.float16,224,256,7168,256,8,0,0,asm,10006+20000,896.5011 +gfx938,f8_w8a8_block,torch.float16,256,256,7168,256,8,0,0,asm,10006+20000,905.1327 +gfx938,f8_w8a8_block,torch.float16,320,256,7168,256,8,0,0,asm,10006+20000,923.0352 +gfx938,f8_w8a8_block,torch.float16,384,256,7168,256,8,0,0,asm,10006+20000,943.2565 +gfx938,f8_w8a8_block,torch.float16,448,256,7168,256,8,0,0,asm,11010+21000,988.6851 +gfx938,f8_w8a8_block,torch.float16,512,256,7168,256,8,0,0,asm,11010+21000,1011.6567 +gfx938,f8_w8a8_block,torch.float16,576,256,7168,256,8,0,0,asm,11010+21000,1028.3101 +gfx938,f8_w8a8_block,torch.float16,640,256,7168,256,8,0,0,asm,11010+21000,1040.4231 +gfx938,f8_w8a8_block,torch.float16,704,256,7168,256,8,0,0,asm,11010+21000,1062.4813 +gfx938,f8_w8a8_block,torch.float16,768,256,7168,256,8,0,0,asm,11010+21000,1077.4394 +gfx938,f8_w8a8_block,torch.float16,832,256,7168,256,8,0,0,asm,11010+21000,1089.0687 +gfx938,f8_w8a8_block,torch.float16,896,256,7168,256,8,0,0,asm,11010+21000,1155.8735 +gfx938,f8_w8a8_block,torch.float16,960,256,7168,256,8,0,0,asm,11010+21000,1248.7023 +gfx938,f8_w8a8_block,torch.float16,1024,256,7168,256,8,0,0,asm,11010+21000,1354.4664 +gfx938,f8_w8a8_block,torch.float16,1152,256,7168,256,8,0,0,asm,12003+22000,1419.539 +gfx938,f8_w8a8_block,torch.float16,1280,256,7168,256,8,0,0,asm,12003+22000,1460.1541 
+gfx938,f8_w8a8_block,torch.float16,1408,256,7168,256,8,0,0,asm,12003+22000,1472.3131 +gfx938,f8_w8a8_block,torch.float16,1536,256,7168,256,8,0,0,asm,12003+22000,1503.9613 +gfx938,f8_w8a8_block,torch.float16,1664,256,7168,256,8,0,0,asm,12003+22000,1542.3411 +gfx938,f8_w8a8_block,torch.float16,1792,256,7168,256,8,0,0,asm,12003+22000,1692.1873 +gfx938,f8_w8a8_block,torch.float16,1920,256,7168,256,8,0,0,asm,12003+22000,1780.018 +gfx938,f8_w8a8_block,torch.float16,2048,256,7168,256,8,0,0,asm,12003+22000,2044.5118 +gfx938,f8_w8a8_block,torch.float16,2304,256,7168,256,8,0,0,asm,13001+23000,2335.2539 +gfx938,f8_w8a8_block,torch.float16,2560,256,7168,256,8,0,0,asm,13001+23000,2437.9238 +gfx938,f8_w8a8_block,torch.float16,2816,256,7168,256,8,0,0,asm,13001+23000,2511.6265 +gfx938,f8_w8a8_block,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23000,2562.9169 +gfx938,f8_w8a8_block,torch.float16,3328,256,7168,256,8,0,0,asm,13001+23000,2649.1537 +gfx938,f8_w8a8_block,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23000,2733.0184 +gfx938,f8_w8a8_block,torch.float16,3840,256,7168,256,8,0,0,asm,13001+23000,2959.8084 +gfx938,f8_w8a8_block,torch.float16,4096,256,7168,256,8,0,0,asm,13001+23000,3382.3163 +gfx938,f8_w8a8_block,torch.float16,4608,256,7168,256,8,0,0,asm,12003+22000,3998.149 +gfx938,f8_w8a8_block,torch.float16,5120,256,7168,256,8,0,0,asm,12003+22000,4229.2055 +gfx938,f8_w8a8_block,torch.float16,5632,256,7168,256,8,0,0,asm,12003+22000,4436.4858 +gfx938,f8_w8a8_block,torch.float16,6144,256,7168,256,8,0,0,asm,13001+23000,4671.2596 +gfx938,f8_w8a8_block,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23000,4781.0715 +gfx938,f8_w8a8_block,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23000,4890.9133 +gfx938,f8_w8a8_block,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23000,5179.0355 +gfx938,f8_w8a8_block,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23000,5775.563 +gfx938,f8_w8a8_block,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23000,7044.6425 
+gfx938,f8_w8a8_block,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23000,8183.457 +gfx938,f8_w8a8_block,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23000,9436.2243 +gfx938,f8_w8a8_block,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23000,10631.6376 +gfx938,f8_w8a8_block,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23000,11551.5316 +gfx938,f8_w8a8_block,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23000,15473.3602 +gfx938,f8_w8a8_block,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23000,20238.4598 +gfx938,f8_w8a8_block,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23000,24225.5718 +gfx938,f8_w8a8_block,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23000,29117.2928 +gfx938,f8_w8a8_block,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23000,34120.5525 +gfx938,f8_w8a8_block,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23000,39073.9223 +gfx938,f8_w8a8_block,torch.float16,1,256,7168,257,9,0,0,asm,10007+20000,76.1697 +gfx938,f8_w8a8_block,torch.float16,2,256,7168,257,9,0,0,asm,10002+20000,106.8558 +gfx938,f8_w8a8_block,torch.float16,4,256,7168,257,9,0,0,asm,10006+20000,162.4684 +gfx938,f8_w8a8_block,torch.float16,6,256,7168,257,9,0,0,asm,10002+20000,219.2933 +gfx938,f8_w8a8_block,torch.float16,8,256,7168,257,9,0,0,asm,10006+20000,263.1334 +gfx938,f8_w8a8_block,torch.float16,12,256,7168,257,9,0,0,asm,10001+20000,357.1709 +gfx938,f8_w8a8_block,torch.float16,16,256,7168,257,9,0,0,asm,10002+20000,446.2233 +gfx938,f8_w8a8_block,torch.float16,20,256,7168,257,9,0,0,asm,10006+20200,499.2896 +gfx938,f8_w8a8_block,torch.float16,24,256,7168,257,9,0,0,asm,10002+20200,569.8494 +gfx938,f8_w8a8_block,torch.float16,28,256,7168,257,9,0,0,asm,10002+20200,620.7912 +gfx938,f8_w8a8_block,torch.float16,32,256,7168,257,9,0,0,asm,10006+20200,634.5932 +gfx938,f8_w8a8_block,torch.float16,36,256,7168,257,9,0,0,asm,10002+20200,683.0027 +gfx938,f8_w8a8_block,torch.float16,40,256,7168,257,9,0,0,asm,10002+20200,697.5795 
+gfx938,f8_w8a8_block,torch.float16,44,256,7168,257,9,0,0,asm,10002+20200,710.9268 +gfx938,f8_w8a8_block,torch.float16,48,256,7168,257,9,0,0,asm,10006+20200,739.6425 +gfx938,f8_w8a8_block,torch.float16,56,256,7168,257,9,0,0,asm,10002+20200,808.7286 +gfx938,f8_w8a8_block,torch.float16,64,256,7168,257,9,0,0,asm,10002+20200,819.6086 +gfx938,f8_w8a8_block,torch.float16,72,256,7168,257,9,0,0,asm,10006+20000,842.918 +gfx938,f8_w8a8_block,torch.float16,80,256,7168,257,9,0,0,asm,10006+20000,853.7896 +gfx938,f8_w8a8_block,torch.float16,96,256,7168,257,9,0,0,asm,10006+20000,883.76 +gfx938,f8_w8a8_block,torch.float16,104,256,7168,257,9,0,0,asm,10006+20000,887.9284 +gfx938,f8_w8a8_block,torch.float16,112,256,7168,257,9,0,0,asm,10002+20000,904.5767 +gfx938,f8_w8a8_block,torch.float16,128,256,7168,257,9,0,0,asm,10002+20000,924.9725 +gfx938,f8_w8a8_block,torch.float16,144,256,7168,257,9,0,0,asm,10002+20200,932.5766 +gfx938,f8_w8a8_block,torch.float16,160,256,7168,257,9,0,0,asm,10002+20200,939.3472 +gfx938,f8_w8a8_block,torch.float16,192,256,7168,257,9,0,0,asm,10002+20200,951.3051 +gfx938,f8_w8a8_block,torch.float16,224,256,7168,257,9,0,0,asm,10002+20200,957.4777 +gfx938,f8_w8a8_block,torch.float16,256,256,7168,257,9,0,0,asm,10002+20200,967.1197 +gfx938,f8_w8a8_block,torch.float16,320,256,7168,257,9,0,0,asm,10006+20200,974.9429 +gfx938,f8_w8a8_block,torch.float16,384,256,7168,257,9,0,0,asm,10006+20200,1016.8207 +gfx938,f8_w8a8_block,torch.float16,448,256,7168,257,9,0,0,asm,11010+21200,1045.5112 +gfx938,f8_w8a8_block,torch.float16,512,256,7168,257,9,0,0,asm,11010+21000,1080.2647 +gfx938,f8_w8a8_block,torch.float16,768,256,7168,257,9,0,0,asm,11010+21000,1161.0055 +gfx938,f8_w8a8_block,torch.float16,896,256,7168,257,9,0,0,asm,11010+21200,1271.7673 +gfx938,f8_w8a8_block,torch.float16,960,256,7168,257,9,0,0,asm,12003+22000,1399.2027 +gfx938,f8_w8a8_block,torch.float16,1024,256,7168,257,9,0,0,asm,12003+22000,1453.5352 
+gfx938,f8_w8a8_block,torch.float16,1280,256,7168,257,9,0,0,asm,12003+22000,1460.8194 +gfx938,f8_w8a8_block,torch.float16,1536,256,7168,257,9,0,0,asm,12003+22000,1538.6213 +gfx938,f8_w8a8_block,torch.float16,1920,256,7168,257,9,0,0,asm,12003+22000,2139.9837 +gfx938,f8_w8a8_block,torch.float16,2048,256,7168,257,9,0,0,asm,12003+22000,2323.3095 +gfx938,f8_w8a8_block,torch.float16,2304,256,7168,257,9,0,0,asm,13001+23000,2408.0251 +gfx938,f8_w8a8_block,torch.float16,2560,256,7168,257,9,0,0,asm,13001+23000,2462.8459 +gfx938,f8_w8a8_block,torch.float16,3072,256,7168,257,9,0,0,asm,13001+23000,2576.1508 +gfx938,f8_w8a8_block,torch.float16,3584,256,7168,257,9,0,0,asm,13001+23000,3054.2211 +gfx938,f8_w8a8_block,torch.float16,3840,256,7168,257,9,0,0,asm,12005+22000,3551.4827 +gfx938,f8_w8a8_block,torch.float16,4096,256,7168,257,9,0,0,asm,12005+22000,3749.5706 +gfx938,f8_w8a8_block,torch.float16,4608,256,7168,257,9,0,0,asm,12005+22000,3981.5614 +gfx938,f8_w8a8_block,torch.float16,5120,256,7168,257,9,0,0,asm,12005+22000,4231.3544 +gfx938,f8_w8a8_block,torch.float16,6144,256,7168,257,9,0,0,asm,13001+23000,4610.5283 +gfx938,f8_w8a8_block,torch.float16,7168,256,7168,257,9,0,0,asm,13001+23000,5281.9533 +gfx938,f8_w8a8_block,torch.float16,8192,256,7168,257,9,0,0,asm,13001+23000,6427.1963 +gfx938,f8_w8a8_block,torch.float16,10240,256,7168,257,9,0,0,asm,13001+23000,7069.4428 +gfx938,f8_w8a8_block,torch.float16,12288,256,7168,257,9,0,0,asm,14001+24000,8706.9454 +gfx938,f8_w8a8_block,torch.float16,16384,256,7168,257,9,0,0,asm,13001+23000,11135.1487 +gfx938,f8_w8a8_block,torch.float16,24576,256,7168,257,9,0,0,asm,13001+23000,15944.6083 +gfx938,f8_w8a8_block,torch.float16,32768,256,7168,257,9,0,0,asm,13001+23000,21105.1667 +gfx938,f8_w8a8_block,torch.float16,1,256,6144,256,8,0,0,asm,10007+20000,65.1802 +gfx938,f8_w8a8_block,torch.float16,2,256,6144,256,8,0,0,asm,10002+20000,89.8117 +gfx938,f8_w8a8_block,torch.float16,4,256,6144,256,8,0,0,asm,10006+20000,134.7546 
+gfx938,f8_w8a8_block,torch.float16,6,256,6144,256,8,0,0,asm,10002+20000,188.1102 +gfx938,f8_w8a8_block,torch.float16,8,256,6144,256,8,0,0,asm,10006+20000,214.1985 +gfx938,f8_w8a8_block,torch.float16,12,256,6144,256,8,0,0,asm,10002+20000,292.6573 +gfx938,f8_w8a8_block,torch.float16,16,256,6144,256,8,0,0,asm,10006+20000,334.1645 +gfx938,f8_w8a8_block,torch.float16,20,256,6144,256,8,0,0,asm,10002+20000,400.2413 +gfx938,f8_w8a8_block,torch.float16,24,256,6144,256,8,0,0,asm,10006+20000,440.0895 +gfx938,f8_w8a8_block,torch.float16,28,256,6144,256,8,0,0,asm,10002+20000,515.9659 +gfx938,f8_w8a8_block,torch.float16,32,256,6144,256,8,0,0,asm,10006+20200,536.2101 +gfx938,f8_w8a8_block,torch.float16,36,256,6144,256,8,0,0,asm,10006+20200,551.8786 +gfx938,f8_w8a8_block,torch.float16,40,256,6144,256,8,0,0,asm,10002+20200,592.7795 +gfx938,f8_w8a8_block,torch.float16,44,256,6144,256,8,0,0,asm,10002+20200,603.8279 +gfx938,f8_w8a8_block,torch.float16,48,256,6144,256,8,0,0,asm,10006+20200,622.2699 +gfx938,f8_w8a8_block,torch.float16,56,256,6144,256,8,0,0,asm,10006+20000,643.0614 +gfx938,f8_w8a8_block,torch.float16,64,256,6144,256,8,0,0,asm,10002+20200,678.5897 +gfx938,f8_w8a8_block,torch.float16,72,256,6144,256,8,0,0,asm,10002+20000,698.1938 +gfx938,f8_w8a8_block,torch.float16,80,256,6144,256,8,0,0,asm,10002+20000,710.8506 +gfx938,f8_w8a8_block,torch.float16,96,256,6144,256,8,0,0,asm,10006+20200,736.1979 +gfx938,f8_w8a8_block,torch.float16,104,256,6144,256,8,0,0,asm,10006+20200,743.8105 +gfx938,f8_w8a8_block,torch.float16,112,256,6144,256,8,0,0,asm,10006+20200,752.2147 +gfx938,f8_w8a8_block,torch.float16,128,256,6144,256,8,0,0,asm,10002+20000,762.1599 +gfx938,f8_w8a8_block,torch.float16,144,256,6144,256,8,0,0,asm,10002+20000,781.7557 +gfx938,f8_w8a8_block,torch.float16,160,256,6144,256,8,0,0,asm,10002+20200,796.2398 +gfx938,f8_w8a8_block,torch.float16,192,256,6144,256,8,0,0,asm,10002+20200,803.1787 +gfx938,f8_w8a8_block,torch.float16,224,256,6144,256,8,0,0,asm,10002+20200,809.4272 
+gfx938,f8_w8a8_block,torch.float16,256,256,6144,256,8,0,0,asm,10002+20200,814.3955 +gfx938,f8_w8a8_block,torch.float16,320,256,6144,256,8,0,0,asm,10002+20200,832.5007 +gfx938,f8_w8a8_block,torch.float16,384,256,6144,256,8,0,0,asm,10006+20200,842.3365 +gfx938,f8_w8a8_block,torch.float16,448,256,6144,256,8,0,0,asm,10006+20000,883.2795 +gfx938,f8_w8a8_block,torch.float16,512,256,6144,256,8,0,0,asm,11009+21000,918.4626 +gfx938,f8_w8a8_block,torch.float16,768,256,6144,256,8,0,0,asm,11010+21200,988.1044 +gfx938,f8_w8a8_block,torch.float16,896,256,6144,256,8,0,0,asm,11010+21200,1011.2959 +gfx938,f8_w8a8_block,torch.float16,960,256,6144,256,8,0,0,asm,11010+21200,1078.3104 +gfx938,f8_w8a8_block,torch.float16,1024,256,6144,256,8,0,0,asm,11010+21000,1170.7396 +gfx938,f8_w8a8_block,torch.float16,1280,256,6144,256,8,0,0,asm,12003+22000,1247.3204 +gfx938,f8_w8a8_block,torch.float16,1536,256,6144,256,8,0,0,asm,12003+22000,1302.6044 +gfx938,f8_w8a8_block,torch.float16,1920,256,6144,256,8,0,0,asm,12003+22000,1482.9997 +gfx938,f8_w8a8_block,torch.float16,2048,256,6144,256,8,0,0,asm,12003+22000,1709.5082 +gfx938,f8_w8a8_block,torch.float16,2304,256,6144,256,8,0,0,asm,12003+22000,2042.7786 +gfx938,f8_w8a8_block,torch.float16,2560,256,6144,256,8,0,0,asm,13001+23000,2084.2437 +gfx938,f8_w8a8_block,torch.float16,3072,256,6144,256,8,0,0,asm,13001+23000,2171.3676 +gfx938,f8_w8a8_block,torch.float16,3584,256,6144,256,8,0,0,asm,13001+23000,2271.7631 +gfx938,f8_w8a8_block,torch.float16,3840,256,6144,256,8,0,0,asm,13001+23000,2448.1835 +gfx938,f8_w8a8_block,torch.float16,4096,256,6144,256,8,0,0,asm,13001+23000,2797.9086 +gfx938,f8_w8a8_block,torch.float16,4608,256,6144,256,8,0,0,asm,12005+22000,3283.8269 +gfx938,f8_w8a8_block,torch.float16,5120,256,6144,256,8,0,0,asm,12005+22000,3498.3187 +gfx938,f8_w8a8_block,torch.float16,6144,256,6144,256,8,0,0,asm,13001+23000,3881.1637 +gfx938,f8_w8a8_block,torch.float16,7168,256,6144,256,8,0,0,asm,13001+23000,4041.6432 
+gfx938,f8_w8a8_block,torch.float16,8192,256,6144,256,8,0,0,asm,13001+23000,4767.1059 +gfx938,f8_w8a8_block,torch.float16,10240,256,6144,256,8,0,0,asm,13001+23000,5883.4809 +gfx938,f8_w8a8_block,torch.float16,12288,256,6144,256,8,0,0,asm,13001+23000,6685.2213 +gfx938,f8_w8a8_block,torch.float16,16384,256,6144,256,8,0,0,asm,13001+23000,8614.1997 +gfx938,f8_w8a8_block,torch.float16,24576,256,6144,256,8,0,0,asm,13001+23000,12547.1628 +gfx938,f8_w8a8_block,torch.float16,32768,256,6144,256,8,0,0,asm,13001+23000,16452.4205 +gfx938,f8_w8a8_block,torch.float16,1,256,6144,257,9,0,0,asm,10007+20000,71.3526 +gfx938,f8_w8a8_block,torch.float16,2,256,6144,257,9,0,0,asm,10002+20000,96.1274 +gfx938,f8_w8a8_block,torch.float16,4,256,6144,257,9,0,0,asm,10006+20000,142.9145 +gfx938,f8_w8a8_block,torch.float16,6,256,6144,257,9,0,0,asm,10002+20000,195.2598 +gfx938,f8_w8a8_block,torch.float16,8,256,6144,257,9,0,0,asm,10006+20000,227.8996 +gfx938,f8_w8a8_block,torch.float16,12,256,6144,257,9,0,0,asm,10002+20000,310.8635 +gfx938,f8_w8a8_block,torch.float16,16,256,6144,257,9,0,0,asm,10002+20000,395.5621 +gfx938,f8_w8a8_block,torch.float16,20,256,6144,257,9,0,0,asm,10006+20000,433.4706 +gfx938,f8_w8a8_block,torch.float16,24,256,6144,257,9,0,0,asm,10002+20200,498.9272 +gfx938,f8_w8a8_block,torch.float16,28,256,6144,257,9,0,0,asm,10005+20200,540.3953 +gfx938,f8_w8a8_block,torch.float16,32,256,6144,257,9,0,0,asm,10006+20200,549.3553 +gfx938,f8_w8a8_block,torch.float16,36,256,6144,257,9,0,0,asm,10002+20200,591.5079 +gfx938,f8_w8a8_block,torch.float16,40,256,6144,257,9,0,0,asm,10002+20200,603.2889 +gfx938,f8_w8a8_block,torch.float16,44,256,6144,257,9,0,0,asm,10002+20200,615.2552 +gfx938,f8_w8a8_block,torch.float16,48,256,6144,257,9,0,0,asm,10006+20000,638.8088 +gfx938,f8_w8a8_block,torch.float16,56,256,6144,257,9,0,0,asm,10002+20200,698.1096 +gfx938,f8_w8a8_block,torch.float16,64,256,6144,257,9,0,0,asm,10002+20200,706.2865 
+gfx938,f8_w8a8_block,torch.float16,72,256,6144,257,9,0,0,asm,10006+20000,730.6401 +gfx938,f8_w8a8_block,torch.float16,80,256,6144,257,9,0,0,asm,10006+20000,741.5369 +gfx938,f8_w8a8_block,torch.float16,96,256,6144,257,9,0,0,asm,10002+20000,762.0168 +gfx938,f8_w8a8_block,torch.float16,104,256,6144,257,9,0,0,asm,10002+20000,775.5578 +gfx938,f8_w8a8_block,torch.float16,112,256,6144,257,9,0,0,asm,10002+20000,779.8609 +gfx938,f8_w8a8_block,torch.float16,128,256,6144,257,9,0,0,asm,10002+20200,790.1009 +gfx938,f8_w8a8_block,torch.float16,144,256,6144,257,9,0,0,asm,10002+20200,798.8672 +gfx938,f8_w8a8_block,torch.float16,160,256,6144,257,9,0,0,asm,10002+20000,806.9093 +gfx938,f8_w8a8_block,torch.float16,192,256,6144,257,9,0,0,asm,10001+20200,813.0229 +gfx938,f8_w8a8_block,torch.float16,224,256,6144,257,9,0,0,asm,10002+20200,822.6313 +gfx938,f8_w8a8_block,torch.float16,256,256,6144,257,9,0,0,asm,10001+20200,833.3765 +gfx938,f8_w8a8_block,torch.float16,320,256,6144,257,9,0,0,asm,10006+20200,849.2081 +gfx938,f8_w8a8_block,torch.float16,384,256,6144,257,9,0,0,asm,10006+20000,880.9048 +gfx938,f8_w8a8_block,torch.float16,448,256,6144,257,9,0,0,asm,11010+21200,913.6289 +gfx938,f8_w8a8_block,torch.float16,512,256,6144,257,9,0,0,asm,11010+21200,981.9486 +gfx938,f8_w8a8_block,torch.float16,768,256,6144,257,9,0,0,asm,11010+21000,979.4476 +gfx938,f8_w8a8_block,torch.float16,896,256,6144,257,9,0,0,asm,11010+21200,1150.9585 +gfx938,f8_w8a8_block,torch.float16,960,256,6144,257,9,0,0,asm,12003+22000,1219.0678 +gfx938,f8_w8a8_block,torch.float16,1024,256,6144,257,9,0,0,asm,12003+22000,1272.1708 +gfx938,f8_w8a8_block,torch.float16,1280,256,6144,257,9,0,0,asm,12003+22000,1270.6803 +gfx938,f8_w8a8_block,torch.float16,1536,256,6144,257,9,0,0,asm,12003+22000,1338.1916 +gfx938,f8_w8a8_block,torch.float16,1920,256,6144,257,9,0,0,asm,12003+22000,1876.4045 +gfx938,f8_w8a8_block,torch.float16,2048,256,6144,257,9,0,0,asm,12003+22000,2017.7345 
+gfx938,f8_w8a8_block,torch.float16,2304,256,6144,257,9,0,0,asm,13001+23000,2088.9174 +gfx938,f8_w8a8_block,torch.float16,2560,256,6144,257,9,0,0,asm,13001+23000,2126.6857 +gfx938,f8_w8a8_block,torch.float16,3072,256,6144,257,9,0,0,asm,13001+23000,2241.6832 +gfx938,f8_w8a8_block,torch.float16,3584,256,6144,257,9,0,0,asm,13001+23000,2649.9259 +gfx938,f8_w8a8_block,torch.float16,3840,256,6144,257,9,0,0,asm,12005+22000,3052.4003 +gfx938,f8_w8a8_block,torch.float16,4096,256,6144,257,9,0,0,asm,12005+22000,3290.1763 +gfx938,f8_w8a8_block,torch.float16,4608,256,6144,257,9,0,0,asm,12005+22000,3497.8302 +gfx938,f8_w8a8_block,torch.float16,5120,256,6144,257,9,0,0,asm,12005+22000,3640.676 +gfx938,f8_w8a8_block,torch.float16,6144,256,6144,257,9,0,0,asm,13001+23000,3990.1233 +gfx938,f8_w8a8_block,torch.float16,7168,256,6144,257,9,0,0,asm,13001+23000,4562.9129 +gfx938,f8_w8a8_block,torch.float16,8192,256,6144,257,9,0,0,asm,13001+23000,5648.3743 +gfx938,f8_w8a8_block,torch.float16,10240,256,6144,257,9,0,0,asm,13001+23000,6141.1557 +gfx938,f8_w8a8_block,torch.float16,12288,256,6144,257,9,0,0,asm,14001+24000,7544.4518 +gfx938,f8_w8a8_block,torch.float16,16384,256,6144,257,9,0,0,asm,13001+23000,9580.1329 +gfx938,f8_w8a8_block,torch.float16,24576,256,6144,257,9,0,0,asm,13001+23000,13717.7858 +gfx938,f8_w8a8_block,torch.float16,32768,256,6144,257,9,0,0,asm,13001+23000,18186.7557 +gfx938,f8_w8a8_block,torch.float16,1,384,3072,256,8,0,0,asm,10002+20000,68.0432 +gfx938,f8_w8a8_block,torch.float16,2,384,3072,256,8,0,0,asm,10005+20000,81.6263 +gfx938,f8_w8a8_block,torch.float16,4,384,3072,256,8,0,0,asm,10001+20000,126.6955 +gfx938,f8_w8a8_block,torch.float16,6,384,3072,256,8,0,0,asm,10006+20000,149.3566 +gfx938,f8_w8a8_block,torch.float16,8,384,3072,256,8,0,0,asm,10006+20000,187.3859 +gfx938,f8_w8a8_block,torch.float16,12,384,3072,256,8,0,0,asm,10006+20000,246.4426 +gfx938,f8_w8a8_block,torch.float16,16,384,3072,256,8,0,0,asm,10006+20000,297.0613 
+gfx938,f8_w8a8_block,torch.float16,20,384,3072,256,8,0,0,asm,10006+20000,322.1529 +gfx938,f8_w8a8_block,torch.float16,24,384,3072,256,8,0,0,asm,10006+20000,369.5548 +gfx938,f8_w8a8_block,torch.float16,28,384,3072,256,8,0,0,asm,10006+20000,425.1955 +gfx938,f8_w8a8_block,torch.float16,32,384,3072,256,8,0,0,asm,10006+20000,440.5555 +gfx938,f8_w8a8_block,torch.float16,36,384,3072,256,8,0,0,asm,10006+20000,467.4997 +gfx938,f8_w8a8_block,torch.float16,40,384,3072,256,8,0,0,asm,10006+20000,475.6513 +gfx938,f8_w8a8_block,torch.float16,44,384,3072,256,8,0,0,asm,10006+20000,507.3901 +gfx938,f8_w8a8_block,torch.float16,48,384,3072,256,8,0,0,asm,10006+20000,517.7227 +gfx938,f8_w8a8_block,torch.float16,56,384,3072,256,8,0,0,asm,10006+20000,531.4405 +gfx938,f8_w8a8_block,torch.float16,64,384,3072,256,8,0,0,asm,10006+20000,562.2109 +gfx938,f8_w8a8_block,torch.float16,72,384,3072,256,8,0,0,asm,10006+20000,571.7014 +gfx938,f8_w8a8_block,torch.float16,80,384,3072,256,8,0,0,asm,10006+20000,585.0571 +gfx938,f8_w8a8_block,torch.float16,96,384,3072,256,8,0,0,asm,10006+20000,617.6802 +gfx938,f8_w8a8_block,torch.float16,104,384,3072,256,8,0,0,asm,10006+20000,630.278 +gfx938,f8_w8a8_block,torch.float16,112,384,3072,256,8,0,0,asm,10006+20000,629.1917 +gfx938,f8_w8a8_block,torch.float16,128,384,3072,256,8,0,0,asm,10006+20000,632.2991 +gfx938,f8_w8a8_block,torch.float16,144,384,3072,256,8,0,0,asm,10006+20000,635.5074 +gfx938,f8_w8a8_block,torch.float16,160,384,3072,256,8,0,0,asm,10006+20000,640.2232 +gfx938,f8_w8a8_block,torch.float16,192,384,3072,256,8,0,0,asm,10006+20000,645.8653 +gfx938,f8_w8a8_block,torch.float16,224,384,3072,256,8,0,0,asm,10006+20000,646.1769 +gfx938,f8_w8a8_block,torch.float16,256,384,3072,256,8,0,0,asm,10006+20000,650.0422 +gfx938,f8_w8a8_block,torch.float16,320,384,3072,256,8,0,0,asm,10006+20000,661.3263 +gfx938,f8_w8a8_block,torch.float16,384,384,3072,256,8,0,0,asm,10006+20000,677.3178 
+gfx938,f8_w8a8_block,torch.float16,448,384,3072,256,8,0,0,asm,11010+21000,687.4904 +gfx938,f8_w8a8_block,torch.float16,512,384,3072,256,8,0,0,asm,11010+21000,700.4841 +gfx938,f8_w8a8_block,torch.float16,768,384,3072,256,8,0,0,asm,11010+21000,779.8101 +gfx938,f8_w8a8_block,torch.float16,896,384,3072,256,8,0,0,asm,11010+21000,778.1343 +gfx938,f8_w8a8_block,torch.float16,960,384,3072,256,8,0,0,asm,11010+21000,830.3867 +gfx938,f8_w8a8_block,torch.float16,1024,384,3072,256,8,0,0,asm,12003+22000,844.1972 +gfx938,f8_w8a8_block,torch.float16,1280,384,3072,256,8,0,0,asm,12003+22000,868.9634 +gfx938,f8_w8a8_block,torch.float16,1536,384,3072,256,8,0,0,asm,12003+22000,966.8914 +gfx938,f8_w8a8_block,torch.float16,1920,384,3072,256,8,0,0,asm,12003+22000,1074.9331 +gfx938,f8_w8a8_block,torch.float16,2048,384,3072,256,8,0,0,asm,12003+22000,1186.2758 +gfx938,f8_w8a8_block,torch.float16,2304,384,3072,256,8,0,0,asm,13001+22000,1380.894 +gfx938,f8_w8a8_block,torch.float16,2560,384,3072,256,8,0,0,asm,13001+22000,1404.4476 +gfx938,f8_w8a8_block,torch.float16,3072,384,3072,256,8,0,0,asm,13001+22000,1453.7779 +gfx938,f8_w8a8_block,torch.float16,3584,384,3072,256,8,0,0,asm,13001+22000,1557.5754 +gfx938,f8_w8a8_block,torch.float16,3840,384,3072,256,8,0,0,asm,13001+23000,1709.8021 +gfx938,f8_w8a8_block,torch.float16,4096,384,3072,256,8,0,0,asm,12005+22000,1977.001 +gfx938,f8_w8a8_block,torch.float16,4608,384,3072,256,8,0,0,asm,12005+22000,2227.3243 +gfx938,f8_w8a8_block,torch.float16,5120,384,3072,256,8,0,0,asm,12005+22000,2342.4227 +gfx938,f8_w8a8_block,torch.float16,6144,384,3072,256,8,0,0,asm,13001+23000,2614.8596 +gfx938,f8_w8a8_block,torch.float16,7168,384,3072,256,8,0,0,asm,13001+23000,2704.6866 +gfx938,f8_w8a8_block,torch.float16,8192,384,3072,256,8,0,0,asm,13001+23000,3214.4025 +gfx938,f8_w8a8_block,torch.float16,10240,384,3072,256,8,0,0,asm,13001+23000,3899.8566 +gfx938,f8_w8a8_block,torch.float16,12288,384,3072,256,8,0,0,asm,13001+23000,4531.551 
+gfx938,f8_w8a8_block,torch.float16,16384,384,3072,256,8,0,0,asm,13001+23000,5840.8341 +gfx938,f8_w8a8_block,torch.float16,24576,384,3072,256,8,0,0,asm,13001+23000,8385.0091 +gfx938,f8_w8a8_block,torch.float16,32768,384,3072,256,8,0,0,asm,13001+23000,11024.1562 +gfx938,f8_w8a8_block,torch.float16,1,1536,3072,64,8,0,0,asm,10006+20000,124.9272 +gfx938,f8_w8a8_block,torch.float16,2,1536,3072,64,8,0,0,asm,10002+20000,184.5058 +gfx938,f8_w8a8_block,torch.float16,4,1536,3072,64,8,0,0,asm,10006+20000,315.4275 +gfx938,f8_w8a8_block,torch.float16,6,1536,3072,64,8,0,0,asm,10006+20000,423.3682 +gfx938,f8_w8a8_block,torch.float16,8,1536,3072,64,8,0,0,asm,10006+20000,440.0586 +gfx938,f8_w8a8_block,torch.float16,12,1536,3072,64,8,0,0,asm,10006+20000,540.1087 +gfx938,f8_w8a8_block,torch.float16,16,1536,3072,64,8,0,0,asm,10006+20000,596.9254 +gfx938,f8_w8a8_block,torch.float16,20,1536,3072,64,8,0,0,asm,10006+20000,613.5538 +gfx938,f8_w8a8_block,torch.float16,24,1536,3072,64,8,0,0,asm,10006+20000,628.8296 +gfx938,f8_w8a8_block,torch.float16,28,1536,3072,64,8,0,0,asm,10006+20000,670.7272 +gfx938,f8_w8a8_block,torch.float16,32,1536,3072,64,8,0,0,asm,10006+20000,663.1146 +gfx938,f8_w8a8_block,torch.float16,36,1536,3072,64,8,0,0,asm,10006+20000,676.0126 +gfx938,f8_w8a8_block,torch.float16,40,1536,3072,64,8,0,0,asm,10006+20000,675.2884 +gfx938,f8_w8a8_block,torch.float16,44,1536,3072,64,8,0,0,asm,10006+20000,681.5031 +gfx938,f8_w8a8_block,torch.float16,48,1536,3072,64,8,0,0,asm,12003+22001,688.4757 +gfx938,f8_w8a8_block,torch.float16,56,1536,3072,64,8,0,0,asm,10006+20000,691.5746 +gfx938,f8_w8a8_block,torch.float16,64,1536,3072,64,8,0,0,asm,10006+20000,692.2429 +gfx938,f8_w8a8_block,torch.float16,72,1536,3072,64,8,0,0,asm,12003+22001,688.442 +gfx938,f8_w8a8_block,torch.float16,80,1536,3072,64,8,0,0,asm,10006+20000,690.0083 +gfx938,f8_w8a8_block,torch.float16,96,1536,3072,64,8,0,0,asm,12003+22002,711.1282 +gfx938,f8_w8a8_block,torch.float16,104,1536,3072,64,8,0,0,asm,12003+22001,696.5683 
+gfx938,f8_w8a8_block,torch.float16,112,1536,3072,64,8,0,0,asm,12003+22001,698.0841 +gfx938,f8_w8a8_block,torch.float16,128,1536,3072,64,8,0,0,asm,12003+22001,720.4196 +gfx938,f8_w8a8_block,torch.float16,144,1536,3072,64,8,0,0,asm,12003+22001,702.7156 +gfx938,f8_w8a8_block,torch.float16,160,1536,3072,64,8,0,0,asm,12003+22001,711.6082 +gfx938,f8_w8a8_block,torch.float16,192,1536,3072,64,8,0,0,asm,12003+22001,761.4944 +gfx938,f8_w8a8_block,torch.float16,224,1536,3072,64,8,0,0,asm,12003+22001,720.0124 +gfx938,f8_w8a8_block,torch.float16,256,1536,3072,64,8,0,0,asm,12003+22001,723.625 +gfx938,f8_w8a8_block,torch.float16,320,1536,3072,64,8,0,0,asm,12003+22001,742.8671 +gfx938,f8_w8a8_block,torch.float16,384,1536,3072,64,8,0,0,asm,12003+22001,813.1488 +gfx938,f8_w8a8_block,torch.float16,448,1536,3072,64,8,0,0,asm,12003+22001,811.6667 +gfx938,f8_w8a8_block,torch.float16,512,1536,3072,64,8,0,0,asm,12003+22000,1008.7775 +gfx938,f8_w8a8_block,torch.float16,768,1536,3072,64,8,0,0,asm,13001+22000,1064.011 +gfx938,f8_w8a8_block,torch.float16,896,1536,3072,64,8,0,0,asm,13001+22000,1084.3057 +gfx938,f8_w8a8_block,torch.float16,960,1536,3072,64,8,0,0,asm,13001+22000,1237.1219 +gfx938,f8_w8a8_block,torch.float16,1024,1536,3072,64,8,0,0,asm,13001+22000,1476.5315 +gfx938,f8_w8a8_block,torch.float16,1280,1536,3072,64,8,0,0,asm,12005+22000,1796.6397 +gfx938,f8_w8a8_block,torch.float16,1536,1536,3072,64,8,0,0,asm,13001+22000,2017.7420 +gfx938,f8_w8a8_block,torch.float16,1920,1536,3072,64,8,0,0,asm,13001+22000,2136.5373 +gfx938,f8_w8a8_block,torch.float16,2048,1536,3072,64,8,0,0,asm,13001+22000,2407.4415 +gfx938,f8_w8a8_block,torch.float16,2304,1536,3072,64,8,0,0,asm,13001+23000,2837.9661 +gfx938,f8_w8a8_block,torch.float16,2560,1536,3072,64,8,0,0,asm,13001+23000,2906.9257 +gfx938,f8_w8a8_block,torch.float16,3072,1536,3072,64,8,0,0,asm,13001+22000,3408.5995 +gfx938,f8_w8a8_block,torch.float16,3584,1536,3072,64,8,0,0,asm,13001+23000,3882.9218 
+gfx938,f8_w8a8_block,torch.float16,3840,1536,3072,64,8,0,0,asm,13001+23000,3938.888 +gfx938,f8_w8a8_block,torch.float16,4096,1536,3072,64,8,0,0,asm,13001+23000,4351.1558 +gfx938,f8_w8a8_block,torch.float16,4608,1536,3072,64,8,0,0,asm,13001+23000,4852.0379 +gfx938,f8_w8a8_block,torch.float16,5120,1536,3072,64,8,0,0,asm,13001+23000,5303.8594 +gfx938,f8_w8a8_block,torch.float16,6144,1536,3072,64,8,0,0,asm,13001+23000,6242.0787 +gfx938,f8_w8a8_block,torch.float16,7168,1536,3072,64,8,0,0,asm,13001+23000,7198.6896 +gfx938,f8_w8a8_block,torch.float16,8192,1536,3072,64,8,0,0,asm,13001+23000,8159.23309 +gfx938,f8_w8a8_block,torch.float16,10240,1536,3072,64,8,0,0,asm,13001+23000,10081.0697 +gfx938,f8_w8a8_block,torch.float16,12288,1536,3072,64,8,0,0,asm,13001+23000,11951.0998 +gfx938,f8_w8a8_block,torch.float16,16384,1536,3072,64,8,0,0,asm,13001+23000,15692.1712 +gfx938,f8_w8a8_block,torch.float16,24576,1536,3072,64,8,0,0,asm,13001+23000,23308.5444 +gfx938,f8_w8a8_block,torch.float16,32768,1536,3072,64,8,0,0,asm,13001+23000,30796.3452 diff --git a/aiter/configs/tuned_fmoe_asm_w8a8_group_shuffle.csv b/aiter/configs/tuned_fmoe_asm_w8a8_group_shuffle.csv new file mode 100644 index 0000000000000000000000000000000000000000..f8fb181325ca552ec4dbc2e67a2a1f4b3fb3ddcd --- /dev/null +++ b/aiter/configs/tuned_fmoe_asm_w8a8_group_shuffle.csv @@ -0,0 +1,1201 @@ +arch,quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +gfx936,int8_w8a8_block,torch.float16,1,128,7168,256,8,0,0,asm,10008+20000,62.7606 +gfx936,int8_w8a8_block,torch.float16,2,128,7168,256,8,0,0,asm,10007+20101,77.572 +gfx936,int8_w8a8_block,torch.float16,3,128,7168,256,8,0,0,asm,10001+20101,94.0854 +gfx936,int8_w8a8_block,torch.float16,4,128,7168,256,8,0,0,asm,10003+20101,111.1195 +gfx936,int8_w8a8_block,torch.float16,5,128,7168,256,8,0,0,asm,10007+20101,132.7017 +gfx936,int8_w8a8_block,torch.float16,6,128,7168,256,8,0,0,asm,10007+20101,155.4288 
+gfx936,int8_w8a8_block,torch.float16,7,128,7168,256,8,0,0,asm,10007+20101,169.909 +gfx936,int8_w8a8_block,torch.float16,8,128,7168,256,8,0,0,asm,10007+20101,176.9162 +gfx936,int8_w8a8_block,torch.float16,9,128,7168,256,8,0,0,asm,10007+20101,192.9285 +gfx936,int8_w8a8_block,torch.float16,10,128,7168,256,8,0,0,asm,10007+20101,204.7479 +gfx936,int8_w8a8_block,torch.float16,11,128,7168,256,8,0,0,asm,10007+20101,217.0006 +gfx936,int8_w8a8_block,torch.float16,12,128,7168,256,8,0,0,asm,10007+20101,228.7678 +gfx936,int8_w8a8_block,torch.float16,13,128,7168,256,8,0,0,asm,10007+20101,248.1068 +gfx936,int8_w8a8_block,torch.float16,14,128,7168,256,8,0,0,asm,10007+20101,257.9829 +gfx936,int8_w8a8_block,torch.float16,15,128,7168,256,8,0,0,asm,10007+20101,266.59 +gfx936,int8_w8a8_block,torch.float16,16,128,7168,256,8,0,0,asm,10007+20101,273.1376 +gfx936,int8_w8a8_block,torch.float16,17,128,7168,256,8,0,0,asm,10007+20101,278.4476 +gfx936,int8_w8a8_block,torch.float16,18,128,7168,256,8,0,0,asm,10007+20101,287.6112 +gfx936,int8_w8a8_block,torch.float16,20,128,7168,256,8,0,0,asm,10007+20101,309.1361 +gfx936,int8_w8a8_block,torch.float16,24,128,7168,256,8,0,0,asm,10007+20101,353.1594 +gfx936,int8_w8a8_block,torch.float16,28,128,7168,256,8,0,0,asm,10007+20101,402.2906 +gfx936,int8_w8a8_block,torch.float16,32,128,7168,256,8,0,0,asm,10007+20000,432.625 +gfx936,int8_w8a8_block,torch.float16,34,128,7168,256,8,0,0,asm,10007+20000,429.6559 +gfx936,int8_w8a8_block,torch.float16,36,128,7168,256,8,0,0,asm,10007+20101,443.3995 +gfx936,int8_w8a8_block,torch.float16,40,128,7168,256,8,0,0,asm,10007+20101,461.6817 +gfx936,int8_w8a8_block,torch.float16,44,128,7168,256,8,0,0,asm,10007+20100,482.5334 +gfx936,int8_w8a8_block,torch.float16,48,128,7168,256,8,0,0,asm,10007+20100,494.1135 +gfx936,int8_w8a8_block,torch.float16,56,128,7168,256,8,0,0,asm,10007+20100,522.4518 +gfx936,int8_w8a8_block,torch.float16,64,128,7168,256,8,0,0,asm,10007+20100,541.7977 
+gfx936,int8_w8a8_block,torch.float16,68,128,7168,256,8,0,0,asm,10007+20100,544.4355 +gfx936,int8_w8a8_block,torch.float16,72,128,7168,256,8,0,0,asm,10007+20100,559.8259 +gfx936,int8_w8a8_block,torch.float16,80,128,7168,256,8,0,0,asm,10007+20100,579.9696 +gfx936,int8_w8a8_block,torch.float16,88,128,7168,256,8,0,0,asm,10007+20100,595.2271 +gfx936,int8_w8a8_block,torch.float16,96,128,7168,256,8,0,0,asm,10007+20100,604.7679 +gfx936,int8_w8a8_block,torch.float16,104,128,7168,256,8,0,0,asm,10007+20100,611.5854 +gfx936,int8_w8a8_block,torch.float16,112,128,7168,256,8,0,0,asm,10007+20100,619.1943 +gfx936,int8_w8a8_block,torch.float16,128,128,7168,256,8,0,0,asm,10007+20100,632.2913 +gfx936,int8_w8a8_block,torch.float16,144,128,7168,256,8,0,0,asm,10007+20100,646.3492 +gfx936,int8_w8a8_block,torch.float16,160,128,7168,256,8,0,0,asm,10007+20100,653.7156 +gfx936,int8_w8a8_block,torch.float16,192,128,7168,256,8,0,0,asm,10007+20100,665.4176 +gfx936,int8_w8a8_block,torch.float16,224,128,7168,256,8,0,0,asm,10007+20100,686.6979 +gfx936,int8_w8a8_block,torch.float16,256,128,7168,256,8,0,0,asm,10007+20100,691.297 +gfx936,int8_w8a8_block,torch.float16,320,128,7168,256,8,0,0,asm,10007+20100,714.5745 +gfx936,int8_w8a8_block,torch.float16,384,128,7168,256,8,0,0,asm,10007+20000,744.3677 +gfx936,int8_w8a8_block,torch.float16,448,128,7168,256,8,0,0,asm,10007+20101,758.6653 +gfx936,int8_w8a8_block,torch.float16,512,128,7168,256,8,0,0,asm,10007+20101,786.8785 +gfx936,int8_w8a8_block,torch.float16,576,128,7168,256,8,0,0,asm,10002+20101,828.8703 +gfx936,int8_w8a8_block,torch.float16,640,128,7168,256,8,0,0,asm,10006+20101,843.8174 +gfx936,int8_w8a8_block,torch.float16,704,128,7168,256,8,0,0,asm,11007+21000,867.3545 +gfx936,int8_w8a8_block,torch.float16,768,128,7168,256,8,0,0,asm,11007+21000,893.5371 +gfx936,int8_w8a8_block,torch.float16,832,128,7168,256,8,0,0,asm,10004+20101,924.2304 +gfx936,int8_w8a8_block,torch.float16,896,128,7168,256,8,0,0,asm,10004+20101,949.6127 
+gfx936,int8_w8a8_block,torch.float16,960,128,7168,256,8,0,0,asm,11007+21101,962.014 +gfx936,int8_w8a8_block,torch.float16,1024,128,7168,256,8,0,0,asm,11007+21101,982.5596 +gfx936,int8_w8a8_block,torch.float16,1152,128,7168,256,8,0,0,asm,11007+21101,1066.0459 +gfx936,int8_w8a8_block,torch.float16,1280,128,7168,256,8,0,0,asm,11007+21101,1116.8621 +gfx936,int8_w8a8_block,torch.float16,1408,128,7168,256,8,0,0,asm,12000+22000,1178.6121 +gfx936,int8_w8a8_block,torch.float16,1536,128,7168,256,8,0,0,asm,12001+22000,1193.2156 +gfx936,int8_w8a8_block,torch.float16,1664,128,7168,256,8,0,0,asm,11010+21000,1250.1117 +gfx936,int8_w8a8_block,torch.float16,1792,128,7168,256,8,0,0,asm,11010+21000,1293.8122 +gfx936,int8_w8a8_block,torch.float16,1920,128,7168,256,8,0,0,asm,11010+21000,1369.8287 +gfx936,int8_w8a8_block,torch.float16,2048,128,7168,256,8,0,0,asm,11010+21100,1422.1323 +gfx936,int8_w8a8_block,torch.float16,2304,128,7168,256,8,0,0,asm,11010+21000,1572.7088 +gfx936,int8_w8a8_block,torch.float16,2560,128,7168,256,8,0,0,asm,11009+21100,1688.1345 +gfx936,int8_w8a8_block,torch.float16,2816,128,7168,256,8,0,0,asm,11010+21100,1802.7783 +gfx936,int8_w8a8_block,torch.float16,3072,128,7168,256,8,0,0,asm,12003+22000,1897.6074 +gfx936,int8_w8a8_block,torch.float16,3328,128,7168,256,8,0,0,asm,12003+22000,1979.2772 +gfx936,int8_w8a8_block,torch.float16,3584,128,7168,256,8,0,0,asm,12003+22000,2043.1447 +gfx936,int8_w8a8_block,torch.float16,3840,128,7168,256,8,0,0,asm,12003+22000,2184.6107 +gfx936,int8_w8a8_block,torch.float16,4096,128,7168,256,8,0,0,asm,12003+22000,2334.7564 +gfx936,int8_w8a8_block,torch.float16,4608,128,7168,256,8,0,0,asm,12003+22000,2659.1505 +gfx936,int8_w8a8_block,torch.float16,5120,128,7168,256,8,0,0,asm,12003+22000,2817.7639 +gfx936,int8_w8a8_block,torch.float16,5632,128,7168,256,8,0,0,asm,12003+22000,2967.6633 +gfx936,int8_w8a8_block,torch.float16,6144,128,7168,256,8,0,0,asm,12003+22000,3291.914 
+gfx936,int8_w8a8_block,torch.float16,6656,128,7168,256,8,0,0,asm,13001+23000,3421.6085 +gfx936,int8_w8a8_block,torch.float16,7168,128,7168,256,8,0,0,asm,13001+23000,3532.8742 +gfx936,int8_w8a8_block,torch.float16,7680,128,7168,256,8,0,0,asm,13001+23000,3724.739 +gfx936,int8_w8a8_block,torch.float16,8192,128,7168,256,8,0,0,asm,13001+23000,4133.3506 +gfx936,int8_w8a8_block,torch.float16,10240,128,7168,256,8,0,0,asm,13001+23000,5080.0063 +gfx936,int8_w8a8_block,torch.float16,12288,128,7168,256,8,0,0,asm,13001+23000,5935.4214 +gfx936,int8_w8a8_block,torch.float16,14336,128,7168,256,8,0,0,asm,13001+23000,6797.3674 +gfx936,int8_w8a8_block,torch.float16,16384,128,7168,256,8,0,0,asm,13001+23000,7704.9349 +gfx936,int8_w8a8_block,torch.float16,17408,128,7168,256,8,0,0,asm,13001+23000,8254.6709 +gfx936,int8_w8a8_block,torch.float16,24576,128,7168,256,8,0,0,asm,13001+23000,11162.5584 +gfx936,int8_w8a8_block,torch.float16,32768,128,7168,256,8,0,0,asm,13001+23000,14747.3124 +gfx936,int8_w8a8_block,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,14859.3376 +gfx936,int8_w8a8_block,torch.float16,49152,128,7168,256,8,0,0,asm,13001+23000,21041.0904 +gfx936,int8_w8a8_block,torch.float16,57344,128,7168,256,8,0,0,asm,13001+23000,24838.7834 +gfx936,int8_w8a8_block,torch.float16,1,256,7168,256,8,0,0,asm,10008+20000,80.7245 +gfx936,int8_w8a8_block,torch.float16,2,256,7168,256,8,0,0,asm,10007+20000,118.7213 +gfx936,int8_w8a8_block,torch.float16,3,256,7168,256,8,0,0,asm,10007+20000,156.7637 +gfx936,int8_w8a8_block,torch.float16,4,256,7168,256,8,0,0,asm,10003+20000,200.2542 +gfx936,int8_w8a8_block,torch.float16,5,256,7168,256,8,0,0,asm,10007+20000,224.9911 +gfx936,int8_w8a8_block,torch.float16,6,256,7168,256,8,0,0,asm,10007+20000,253.4003 +gfx936,int8_w8a8_block,torch.float16,7,256,7168,256,8,0,0,asm,10007+20000,285.5153 +gfx936,int8_w8a8_block,torch.float16,8,256,7168,256,8,0,0,asm,10007+20000,302.3235 
+gfx936,int8_w8a8_block,torch.float16,9,256,7168,256,8,0,0,asm,10007+20000,332.5963 +gfx936,int8_w8a8_block,torch.float16,10,256,7168,256,8,0,0,asm,10007+20000,356.543 +gfx936,int8_w8a8_block,torch.float16,11,256,7168,256,8,0,0,asm,10007+20000,379.9048 +gfx936,int8_w8a8_block,torch.float16,12,256,7168,256,8,0,0,asm,10007+20000,396.6579 +gfx936,int8_w8a8_block,torch.float16,13,256,7168,256,8,0,0,asm,10007+20000,421.5194 +gfx936,int8_w8a8_block,torch.float16,14,256,7168,256,8,0,0,asm,10007+20000,449.1545 +gfx936,int8_w8a8_block,torch.float16,15,256,7168,256,8,0,0,asm,10007+20000,464.3584 +gfx936,int8_w8a8_block,torch.float16,16,256,7168,256,8,0,0,asm,10007+20000,476.141 +gfx936,int8_w8a8_block,torch.float16,17,256,7168,256,8,0,0,asm,10007+20000,487.9287 +gfx936,int8_w8a8_block,torch.float16,18,256,7168,256,8,0,0,asm,10007+20000,510.899 +gfx936,int8_w8a8_block,torch.float16,20,256,7168,256,8,0,0,asm,10007+20000,549.3483 +gfx936,int8_w8a8_block,torch.float16,24,256,7168,256,8,0,0,asm,10007+20000,629.1473 +gfx936,int8_w8a8_block,torch.float16,28,256,7168,256,8,0,0,asm,10007+20000,707.0826 +gfx936,int8_w8a8_block,torch.float16,32,256,7168,256,8,0,0,asm,10007+20000,759.8868 +gfx936,int8_w8a8_block,torch.float16,34,256,7168,256,8,0,0,asm,10007+20000,764.212 +gfx936,int8_w8a8_block,torch.float16,36,256,7168,256,8,0,0,asm,10007+20000,794.3741 +gfx936,int8_w8a8_block,torch.float16,40,256,7168,256,8,0,0,asm,10007+20000,826.9443 +gfx936,int8_w8a8_block,torch.float16,44,256,7168,256,8,0,0,asm,10007+20000,865.1451 +gfx936,int8_w8a8_block,torch.float16,48,256,7168,256,8,0,0,asm,10007+20000,888.0602 +gfx936,int8_w8a8_block,torch.float16,56,256,7168,256,8,0,0,asm,10007+20000,934.0327 +gfx936,int8_w8a8_block,torch.float16,64,256,7168,256,8,0,0,asm,10007+20000,969.9721 +gfx936,int8_w8a8_block,torch.float16,68,256,7168,256,8,0,0,asm,10007+20000,979.8173 +gfx936,int8_w8a8_block,torch.float16,72,256,7168,256,8,0,0,asm,10007+20000,1006.7779 
+gfx936,int8_w8a8_block,torch.float16,80,256,7168,256,8,0,0,asm,10007+20000,1042.8624 +gfx936,int8_w8a8_block,torch.float16,88,256,7168,256,8,0,0,asm,10007+20000,1074.5793 +gfx936,int8_w8a8_block,torch.float16,96,256,7168,256,8,0,0,asm,10007+20000,1093.5004 +gfx936,int8_w8a8_block,torch.float16,104,256,7168,256,8,0,0,asm,10007+20000,1102.8706 +gfx936,int8_w8a8_block,torch.float16,112,256,7168,256,8,0,0,asm,10007+20000,1120.0809 +gfx936,int8_w8a8_block,torch.float16,128,256,7168,256,8,0,0,asm,10007+20000,1151.5798 +gfx936,int8_w8a8_block,torch.float16,144,256,7168,256,8,0,0,asm,10007+20000,1163.1415 +gfx936,int8_w8a8_block,torch.float16,160,256,7168,256,8,0,0,asm,10007+20000,1176.8686 +gfx936,int8_w8a8_block,torch.float16,192,256,7168,256,8,0,0,asm,10007+20000,1193.123 +gfx936,int8_w8a8_block,torch.float16,224,256,7168,256,8,0,0,asm,10007+20000,1219.1989 +gfx936,int8_w8a8_block,torch.float16,256,256,7168,256,8,0,0,asm,10007+20000,1234.1733 +gfx936,int8_w8a8_block,torch.float16,320,256,7168,256,8,0,0,asm,10007+20000,1247.3027 +gfx936,int8_w8a8_block,torch.float16,384,256,7168,256,8,0,0,asm,10007+20000,1274.2434 +gfx936,int8_w8a8_block,torch.float16,448,256,7168,256,8,0,0,asm,10007+20000,1308.2962 +gfx936,int8_w8a8_block,torch.float16,512,256,7168,256,8,0,0,asm,11005+21000,1362.198 +gfx936,int8_w8a8_block,torch.float16,576,256,7168,256,8,0,0,asm,11007+21000,1392.7605 +gfx936,int8_w8a8_block,torch.float16,640,256,7168,256,8,0,0,asm,11007+21000,1412.2071 +gfx936,int8_w8a8_block,torch.float16,704,256,7168,256,8,0,0,asm,11007+21000,1444.9837 +gfx936,int8_w8a8_block,torch.float16,768,256,7168,256,8,0,0,asm,11007+21000,1470.1744 +gfx936,int8_w8a8_block,torch.float16,832,256,7168,256,8,0,0,asm,11007+21000,1497.4842 +gfx936,int8_w8a8_block,torch.float16,896,256,7168,256,8,0,0,asm,11007+21000,1518.252 +gfx936,int8_w8a8_block,torch.float16,960,256,7168,256,8,0,0,asm,11007+21000,1528.7868 +gfx936,int8_w8a8_block,torch.float16,1024,256,7168,256,8,0,0,asm,11007+21000,1552.9606 
+gfx936,int8_w8a8_block,torch.float16,1152,256,7168,256,8,0,0,asm,12000+22000,1637.8453 +gfx936,int8_w8a8_block,torch.float16,1280,256,7168,256,8,0,0,asm,12000+22000,1670.9866 +gfx936,int8_w8a8_block,torch.float16,1408,256,7168,256,8,0,0,asm,12000+22000,1735.3633 +gfx936,int8_w8a8_block,torch.float16,1536,256,7168,256,8,0,0,asm,11010+21000,1778.5593 +gfx936,int8_w8a8_block,torch.float16,1664,256,7168,256,8,0,0,asm,12000+22000,1833.8578 +gfx936,int8_w8a8_block,torch.float16,1792,256,7168,256,8,0,0,asm,12001+22000,1910.7194 +gfx936,int8_w8a8_block,torch.float16,1920,256,7168,256,8,0,0,asm,12001+22000,1972.2045 +gfx936,int8_w8a8_block,torch.float16,2048,256,7168,256,8,0,0,asm,12001+22000,2084.3359 +gfx936,int8_w8a8_block,torch.float16,2304,256,7168,256,8,0,0,asm,12003+22000,2256.3812 +gfx936,int8_w8a8_block,torch.float16,2560,256,7168,256,8,0,0,asm,12003+22000,2383.4745 +gfx936,int8_w8a8_block,torch.float16,2816,256,7168,256,8,0,0,asm,12003+22000,2460.2104 +gfx936,int8_w8a8_block,torch.float16,3072,256,7168,256,8,0,0,asm,12003+22000,2541.0377 +gfx936,int8_w8a8_block,torch.float16,3328,256,7168,256,8,0,0,asm,12002+22000,2635.4526 +gfx936,int8_w8a8_block,torch.float16,3584,256,7168,256,8,0,0,asm,12003+22000,2742.33 +gfx936,int8_w8a8_block,torch.float16,3840,256,7168,256,8,0,0,asm,12003+22000,2879.9588 +gfx936,int8_w8a8_block,torch.float16,4096,256,7168,256,8,0,0,asm,12003+22000,3101.3169 +gfx936,int8_w8a8_block,torch.float16,4608,256,7168,256,8,0,0,asm,12003+22000,3600.3408 +gfx936,int8_w8a8_block,torch.float16,5120,256,7168,256,8,0,0,asm,12003+22000,3814.8517 +gfx936,int8_w8a8_block,torch.float16,5632,256,7168,256,8,0,0,asm,12003+22000,3988.5398 +gfx936,int8_w8a8_block,torch.float16,6144,256,7168,256,8,0,0,asm,12002+22000,4396.1157 +gfx936,int8_w8a8_block,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23000,4648.7067 +gfx936,int8_w8a8_block,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23000,4818.7636 
+gfx936,int8_w8a8_block,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23000,5024.0016 +gfx936,int8_w8a8_block,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23000,5549.7372 +gfx936,int8_w8a8_block,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23000,6802.854 +gfx936,int8_w8a8_block,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23000,7901.7596 +gfx936,int8_w8a8_block,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23000,9098.0764 +gfx936,int8_w8a8_block,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23000,10190.1389 +gfx936,int8_w8a8_block,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23000,10946.5745 +gfx936,int8_w8a8_block,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23000,14740.2802 +gfx936,int8_w8a8_block,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23000,19489.3806 +gfx936,int8_w8a8_block,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23000,22699.7344 +gfx936,int8_w8a8_block,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23000,27492.7934 +gfx938,int8_w8a8_block,torch.float16,1,128,7168,256,8,0,0,asm,10008+20100,64.3619 +gfx938,int8_w8a8_block,torch.float16,2,128,7168,256,8,0,0,asm,10007+20101,74.3393 +gfx938,int8_w8a8_block,torch.float16,3,128,7168,256,8,0,0,asm,10002+20101,85.2683 +gfx938,int8_w8a8_block,torch.float16,4,128,7168,256,8,0,0,asm,10007+20101,102.7658 +gfx938,int8_w8a8_block,torch.float16,5,128,7168,256,8,0,0,asm,10006+20101,126.6177 +gfx938,int8_w8a8_block,torch.float16,6,128,7168,256,8,0,0,asm,10006+20101,131.0115 +gfx938,int8_w8a8_block,torch.float16,7,128,7168,256,8,0,0,asm,10005+20101,137.7564 +gfx938,int8_w8a8_block,torch.float16,8,128,7168,256,8,0,0,asm,10006+20101,143.7584 +gfx938,int8_w8a8_block,torch.float16,9,128,7168,256,8,0,0,asm,10002+20101,157.2804 +gfx938,int8_w8a8_block,torch.float16,10,128,7168,256,8,0,0,asm,10002+20101,170.1256 +gfx938,int8_w8a8_block,torch.float16,11,128,7168,256,8,0,0,asm,10007+20101,202.2689 +gfx938,int8_w8a8_block,torch.float16,12,128,7168,256,8,0,0,asm,10002+20101,200.4762 
+gfx938,int8_w8a8_block,torch.float16,13,128,7168,256,8,0,0,asm,10002+20101,204.1725 +gfx938,int8_w8a8_block,torch.float16,14,128,7168,256,8,0,0,asm,10002+20101,209.0177 +gfx938,int8_w8a8_block,torch.float16,15,128,7168,256,8,0,0,asm,10001+20101,214.7243 +gfx938,int8_w8a8_block,torch.float16,16,128,7168,256,8,0,0,asm,10002+20101,218.2157 +gfx938,int8_w8a8_block,torch.float16,17,128,7168,256,8,0,0,asm,10003+20101,222.0033 +gfx938,int8_w8a8_block,torch.float16,18,128,7168,256,8,0,0,asm,10002+20101,226.8935 +gfx938,int8_w8a8_block,torch.float16,20,128,7168,256,8,0,0,asm,10006+20101,253.5108 +gfx938,int8_w8a8_block,torch.float16,24,128,7168,256,8,0,0,asm,10002+20101,277.08 +gfx938,int8_w8a8_block,torch.float16,28,128,7168,256,8,0,0,asm,10002+20101,327.7822 +gfx938,int8_w8a8_block,torch.float16,32,128,7168,256,8,0,0,asm,10002+20101,338.7196 +gfx938,int8_w8a8_block,torch.float16,34,128,7168,256,8,0,0,asm,10002+20101,323.1064 +gfx938,int8_w8a8_block,torch.float16,36,128,7168,256,8,0,0,asm,10001+20101,336.2674 +gfx938,int8_w8a8_block,torch.float16,40,128,7168,256,8,0,0,asm,10005+20101,361.1001 +gfx938,int8_w8a8_block,torch.float16,44,128,7168,256,8,0,0,asm,10006+20101,371.578 +gfx938,int8_w8a8_block,torch.float16,48,128,7168,256,8,0,0,asm,10002+20101,376.6324 +gfx938,int8_w8a8_block,torch.float16,56,128,7168,256,8,0,0,asm,10002+20101,390.3019 +gfx938,int8_w8a8_block,torch.float16,64,128,7168,256,8,0,0,asm,10002+20101,415.6762 +gfx938,int8_w8a8_block,torch.float16,68,128,7168,256,8,0,0,asm,10002+20101,429.6441 +gfx938,int8_w8a8_block,torch.float16,72,128,7168,256,8,0,0,asm,10002+20101,430.7736 +gfx938,int8_w8a8_block,torch.float16,80,128,7168,256,8,0,0,asm,10007+20101,452.7017 +gfx938,int8_w8a8_block,torch.float16,88,128,7168,256,8,0,0,asm,10002+20101,447.3998 +gfx938,int8_w8a8_block,torch.float16,96,128,7168,256,8,0,0,asm,10002+20101,447.6351 +gfx938,int8_w8a8_block,torch.float16,104,128,7168,256,8,0,0,asm,10001+20101,460.7016 
+gfx938,int8_w8a8_block,torch.float16,112,128,7168,256,8,0,0,asm,10002+20101,460.3201 +gfx938,int8_w8a8_block,torch.float16,128,128,7168,256,8,0,0,asm,10002+20101,463.9509 +gfx938,int8_w8a8_block,torch.float16,144,128,7168,256,8,0,0,asm,10002+20101,489.5343 +gfx938,int8_w8a8_block,torch.float16,160,128,7168,256,8,0,0,asm,10002+20101,494.0718 +gfx938,int8_w8a8_block,torch.float16,192,128,7168,256,8,0,0,asm,10002+20101,503.7496 +gfx938,int8_w8a8_block,torch.float16,224,128,7168,256,8,0,0,asm,10001+20101,509.2415 +gfx938,int8_w8a8_block,torch.float16,256,128,7168,256,8,0,0,asm,10005+20101,514.1414 +gfx938,int8_w8a8_block,torch.float16,320,128,7168,256,8,0,0,asm,10006+20101,533.5136 +gfx938,int8_w8a8_block,torch.float16,384,128,7168,256,8,0,0,asm,10002+20101,538.608 +gfx938,int8_w8a8_block,torch.float16,448,128,7168,256,8,0,0,asm,10002+20101,563.3719 +gfx938,int8_w8a8_block,torch.float16,512,128,7168,256,8,0,0,asm,10005+20101,587.4498 +gfx938,int8_w8a8_block,torch.float16,576,128,7168,256,8,0,0,asm,11010+21101,608.6394 +gfx938,int8_w8a8_block,torch.float16,640,128,7168,256,8,0,0,asm,11010+21101,632.7871 +gfx938,int8_w8a8_block,torch.float16,704,128,7168,256,8,0,0,asm,11010+21101,641.8249 +gfx938,int8_w8a8_block,torch.float16,768,128,7168,256,8,0,0,asm,11010+21101,654.0136 +gfx938,int8_w8a8_block,torch.float16,832,128,7168,256,8,0,0,asm,11009+21101,663.6258 +gfx938,int8_w8a8_block,torch.float16,896,128,7168,256,8,0,0,asm,11006+21101,696.7302 +gfx938,int8_w8a8_block,torch.float16,960,128,7168,256,8,0,0,asm,11010+21101,711.3972 +gfx938,int8_w8a8_block,torch.float16,1024,128,7168,256,8,0,0,asm,11009+21101,761.4979 +gfx938,int8_w8a8_block,torch.float16,1152,128,7168,256,8,0,0,asm,11010+21101,829.2662 +gfx938,int8_w8a8_block,torch.float16,1280,128,7168,256,8,0,0,asm,11010+21101,870.6156 +gfx938,int8_w8a8_block,torch.float16,1408,128,7168,256,8,0,0,asm,11010+21101,897.6952 +gfx938,int8_w8a8_block,torch.float16,1536,128,7168,256,8,0,0,asm,11010+21101,981.7316 
+gfx938,int8_w8a8_block,torch.float16,1664,128,7168,256,8,0,0,asm,11010+21101,997.8185 +gfx938,int8_w8a8_block,torch.float16,1792,128,7168,256,8,0,0,asm,11010+21101,1033.4067 +gfx938,int8_w8a8_block,torch.float16,1920,128,7168,256,8,0,0,asm,11010+21101,1108.3387 +gfx938,int8_w8a8_block,torch.float16,2048,128,7168,256,8,0,0,asm,11010+21101,1145.0203 +gfx938,int8_w8a8_block,torch.float16,2304,128,7168,256,8,0,0,asm,11010+21101,1316.086 +gfx938,int8_w8a8_block,torch.float16,2560,128,7168,256,8,0,0,asm,11010+21101,1387.9345 +gfx938,int8_w8a8_block,torch.float16,2816,128,7168,256,8,0,0,asm,11010+21101,1488.2435 +gfx938,int8_w8a8_block,torch.float16,3072,128,7168,256,8,0,0,asm,11010+21101,1620.1726 +gfx938,int8_w8a8_block,torch.float16,3328,128,7168,256,8,0,0,asm,11010+21101,1742.2218 +gfx938,int8_w8a8_block,torch.float16,3584,128,7168,256,8,0,0,asm,11010+21101,1862.3263 +gfx938,int8_w8a8_block,torch.float16,3840,128,7168,256,8,0,0,asm,11010+21101,1934.8699 +gfx938,int8_w8a8_block,torch.float16,4096,128,7168,256,8,0,0,asm,11010+21101,2045.1929 +gfx938,int8_w8a8_block,torch.float16,4608,128,7168,256,8,0,0,asm,11010+21101,2279.0466 +gfx938,int8_w8a8_block,torch.float16,5120,128,7168,256,8,0,0,asm,11010+21101,2509.8803 +gfx938,int8_w8a8_block,torch.float16,5632,128,7168,256,8,0,0,asm,11010+21101,2757.2319 +gfx938,int8_w8a8_block,torch.float16,6144,128,7168,256,8,0,0,asm,11010+21101,2985.8396 +gfx938,int8_w8a8_block,torch.float16,6656,128,7168,256,8,0,0,asm,11010+21101,3184.1954 +gfx938,int8_w8a8_block,torch.float16,7168,128,7168,256,8,0,0,asm,11010+21101,3425.8325 +gfx938,int8_w8a8_block,torch.float16,7680,128,7168,256,8,0,0,asm,11010+21101,3647.3802 +gfx938,int8_w8a8_block,torch.float16,8192,128,7168,256,8,0,0,asm,11010+21101,3867.3574 +gfx938,int8_w8a8_block,torch.float16,10240,128,7168,256,8,0,0,asm,11010+21101,4765.7119 +gfx938,int8_w8a8_block,torch.float16,12288,128,7168,256,8,0,0,asm,11010+21101,5724.0817 
+gfx938,int8_w8a8_block,torch.float16,14336,128,7168,256,8,0,0,asm,11010+21101,6569.5353 +gfx938,int8_w8a8_block,torch.float16,16384,128,7168,256,8,0,0,asm,11010+21101,7444.7413 +gfx938,int8_w8a8_block,torch.float16,17408,128,7168,256,8,0,0,asm,11010+21101,7919.0884 +gfx938,int8_w8a8_block,torch.float16,24576,128,7168,256,8,0,0,asm,11010+21101,11183.5046 +gfx938,int8_w8a8_block,torch.float16,32768,128,7168,256,8,0,0,asm,11010+21101,14692.1092 +gfx938,int8_w8a8_block,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,15532.3525 +gfx938,int8_w8a8_block,torch.float16,49152,128,7168,256,8,0,0,asm,11010+21101,21012.5975 +gfx938,int8_w8a8_block,torch.float16,57344,128,7168,256,8,0,0,asm,11010+21101,24817.8367 +gfx938,int8_w8a8_block,torch.float16,65536,128,7168,256,8,0,0,asm,11010+21101,28757.9419 +gfx938,int8_w8a8_block,torch.float16,1,256,7168,256,8,0,0,asm,10007+20000,74.0639 +gfx938,int8_w8a8_block,torch.float16,2,256,7168,256,8,0,0,asm,10002+20000,102.3192 +gfx938,int8_w8a8_block,torch.float16,3,256,7168,256,8,0,0,asm,10006+20000,135.7052 +gfx938,int8_w8a8_block,torch.float16,4,256,7168,256,8,0,0,asm,10002+20000,169.3255 +gfx938,int8_w8a8_block,torch.float16,5,256,7168,256,8,0,0,asm,10002+20000,197.5692 +gfx938,int8_w8a8_block,torch.float16,6,256,7168,256,8,0,0,asm,10002+20000,206.6023 +gfx938,int8_w8a8_block,torch.float16,7,256,7168,256,8,0,0,asm,10003+20000,229.8469 +gfx938,int8_w8a8_block,torch.float16,8,256,7168,256,8,0,0,asm,10006+20000,243.5164 +gfx938,int8_w8a8_block,torch.float16,9,256,7168,256,8,0,0,asm,10006+20000,260.1334 +gfx938,int8_w8a8_block,torch.float16,10,256,7168,256,8,0,0,asm,10005+20000,280.0511 +gfx938,int8_w8a8_block,torch.float16,11,256,7168,256,8,0,0,asm,10002+20000,305.6738 +gfx938,int8_w8a8_block,torch.float16,12,256,7168,256,8,0,0,asm,10002+20000,310.3367 +gfx938,int8_w8a8_block,torch.float16,13,256,7168,256,8,0,0,asm,10001+20000,323.612 +gfx938,int8_w8a8_block,torch.float16,14,256,7168,256,8,0,0,asm,10006+20000,341.7984 
+gfx938,int8_w8a8_block,torch.float16,15,256,7168,256,8,0,0,asm,10005+20000,359.621 +gfx938,int8_w8a8_block,torch.float16,16,256,7168,256,8,0,0,asm,10006+20000,361.7879 +gfx938,int8_w8a8_block,torch.float16,17,256,7168,256,8,0,0,asm,10006+20000,365.4648 +gfx938,int8_w8a8_block,torch.float16,18,256,7168,256,8,0,0,asm,10005+20000,379.5129 +gfx938,int8_w8a8_block,torch.float16,20,256,7168,256,8,0,0,asm,10001+20000,410.4977 +gfx938,int8_w8a8_block,torch.float16,24,256,7168,256,8,0,0,asm,10006+20000,460.7313 +gfx938,int8_w8a8_block,torch.float16,28,256,7168,256,8,0,0,asm,10002+20000,523.8077 +gfx938,int8_w8a8_block,torch.float16,32,256,7168,256,8,0,0,asm,10006+20000,555.1036 +gfx938,int8_w8a8_block,torch.float16,34,256,7168,256,8,0,0,asm,10006+20000,554.0263 +gfx938,int8_w8a8_block,torch.float16,36,256,7168,256,8,0,0,asm,10002+20000,572.3591 +gfx938,int8_w8a8_block,torch.float16,40,256,7168,256,8,0,0,asm,10001+20000,593.9758 +gfx938,int8_w8a8_block,torch.float16,44,256,7168,256,8,0,0,asm,10001+20000,617.3905 +gfx938,int8_w8a8_block,torch.float16,48,256,7168,256,8,0,0,asm,10001+20000,633.2489 +gfx938,int8_w8a8_block,torch.float16,56,256,7168,256,8,0,0,asm,10002+20000,662.2223 +gfx938,int8_w8a8_block,torch.float16,64,256,7168,256,8,0,0,asm,10002+20000,685.0154 +gfx938,int8_w8a8_block,torch.float16,68,256,7168,256,8,0,0,asm,10001+20000,695.8222 +gfx938,int8_w8a8_block,torch.float16,72,256,7168,256,8,0,0,asm,10001+20000,709.9707 +gfx938,int8_w8a8_block,torch.float16,80,256,7168,256,8,0,0,asm,10001+20000,774.6035 +gfx938,int8_w8a8_block,torch.float16,88,256,7168,256,8,0,0,asm,10002+20000,757.6815 +gfx938,int8_w8a8_block,torch.float16,96,256,7168,256,8,0,0,asm,10002+20000,760.8093 +gfx938,int8_w8a8_block,torch.float16,104,256,7168,256,8,0,0,asm,10002+20000,779.7121 +gfx938,int8_w8a8_block,torch.float16,112,256,7168,256,8,0,0,asm,10002+20000,776.0946 +gfx938,int8_w8a8_block,torch.float16,128,256,7168,256,8,0,0,asm,10002+20000,792.7909 
+gfx938,int8_w8a8_block,torch.float16,144,256,7168,256,8,0,0,asm,10002+20000,800.8822 +gfx938,int8_w8a8_block,torch.float16,160,256,7168,256,8,0,0,asm,10002+20000,812.1142 +gfx938,int8_w8a8_block,torch.float16,192,256,7168,256,8,0,0,asm,10002+20000,825.0625 +gfx938,int8_w8a8_block,torch.float16,224,256,7168,256,8,0,0,asm,10002+20000,833.9061 +gfx938,int8_w8a8_block,torch.float16,256,256,7168,256,8,0,0,asm,10002+20000,844.1267 +gfx938,int8_w8a8_block,torch.float16,320,256,7168,256,8,0,0,asm,10002+20000,872.8442 +gfx938,int8_w8a8_block,torch.float16,384,256,7168,256,8,0,0,asm,10005+20000,903.3915 +gfx938,int8_w8a8_block,torch.float16,448,256,7168,256,8,0,0,asm,10006+20000,945.5806 +gfx938,int8_w8a8_block,torch.float16,512,256,7168,256,8,0,0,asm,11006+21000,960.4355 +gfx938,int8_w8a8_block,torch.float16,576,256,7168,256,8,0,0,asm,11006+21000,975.8254 +gfx938,int8_w8a8_block,torch.float16,640,256,7168,256,8,0,0,asm,11006+21000,997.3548 +gfx938,int8_w8a8_block,torch.float16,704,256,7168,256,8,0,0,asm,11006+21000,1015.4870000000001 +gfx938,int8_w8a8_block,torch.float16,768,256,7168,256,8,0,0,asm,11010+21000,1032.7099 +gfx938,int8_w8a8_block,torch.float16,832,256,7168,256,8,0,0,asm,11010+21000,1051.2385 +gfx938,int8_w8a8_block,torch.float16,896,256,7168,256,8,0,0,asm,11010+21000,1096.4795 +gfx938,int8_w8a8_block,torch.float16,960,256,7168,256,8,0,0,asm,11010+21000,1153.3346 +gfx938,int8_w8a8_block,torch.float16,1024,256,7168,256,8,0,0,asm,11010+21000,1210.1494 +gfx938,int8_w8a8_block,torch.float16,1152,256,7168,256,8,0,0,asm,11010+21000,1356.8225 +gfx938,int8_w8a8_block,torch.float16,1280,256,7168,256,8,0,0,asm,12003+22000,1430.5484 +gfx938,int8_w8a8_block,torch.float16,1408,256,7168,256,8,0,0,asm,12003+22000,1462.7597 +gfx938,int8_w8a8_block,torch.float16,1536,256,7168,256,8,0,0,asm,12003+22000,1491.0231 +gfx938,int8_w8a8_block,torch.float16,1664,256,7168,256,8,0,0,asm,12003+22000,1532.0672 
+gfx938,int8_w8a8_block,torch.float16,1792,256,7168,256,8,0,0,asm,12003+22000,1634.4874 +gfx938,int8_w8a8_block,torch.float16,1920,256,7168,256,8,0,0,asm,12003+22000,1704.1272 +gfx938,int8_w8a8_block,torch.float16,2048,256,7168,256,8,0,0,asm,11010+21000,1906.7057 +gfx938,int8_w8a8_block,torch.float16,2304,256,7168,256,8,0,0,asm,11010+21000,2163.0306 +gfx938,int8_w8a8_block,torch.float16,2560,256,7168,256,8,0,0,asm,11010+21000,2300.1239 +gfx938,int8_w8a8_block,torch.float16,2816,256,7168,256,8,0,0,asm,11010+21000,2437.1261 +gfx938,int8_w8a8_block,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23000,2637.9526 +gfx938,int8_w8a8_block,torch.float16,3328,256,7168,256,8,0,0,asm,12003+22000,2726.2909 +gfx938,int8_w8a8_block,torch.float16,3584,256,7168,256,8,0,0,asm,12003+22000,2809.1701 +gfx938,int8_w8a8_block,torch.float16,3840,256,7168,256,8,0,0,asm,12003+22000,2992.6435 +gfx938,int8_w8a8_block,torch.float16,4096,256,7168,256,8,0,0,asm,12003+22000,3289.1543 +gfx938,int8_w8a8_block,torch.float16,4608,256,7168,256,8,0,0,asm,11010+21000,3804.5302 +gfx938,int8_w8a8_block,torch.float16,5120,256,7168,256,8,0,0,asm,12003+22000,4015.5876 +gfx938,int8_w8a8_block,torch.float16,5632,256,7168,256,8,0,0,asm,12003+22000,4216.9662 +gfx938,int8_w8a8_block,torch.float16,6144,256,7168,256,8,0,0,asm,12003+22000,4648.8637 +gfx938,int8_w8a8_block,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23000,4987.557 +gfx938,int8_w8a8_block,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23000,5097.7043 +gfx938,int8_w8a8_block,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23000,5418.374 +gfx938,int8_w8a8_block,torch.float16,8192,256,7168,256,8,0,0,asm,12003+22000,5973.1091 +gfx938,int8_w8a8_block,torch.float16,10240,256,7168,256,8,0,0,asm,12003+22000,7325.1873 +gfx938,int8_w8a8_block,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23000,8555.6002 +gfx938,int8_w8a8_block,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23000,9886.2731 
+gfx938,int8_w8a8_block,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23000,11056.8632 +gfx938,int8_w8a8_block,torch.float16,17408,256,7168,256,8,0,0,asm,12003+22000,12029.4321 +gfx938,int8_w8a8_block,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23000,16199.1447 +gfx938,int8_w8a8_block,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23000,21211.1206 +gfx938,int8_w8a8_block,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23000,25292.6729 +gfx938,int8_w8a8_block,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23000,30310.2679 +gfx938,int8_w8a8_block,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23000,35514.6339 +gfx938,int8_w8a8_block,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23000,40549.9911 +gfx938,f8_w8a8_block,torch.float16,1,128,7168,256,8,0,0,asm,10008+20000,64.5697 +gfx938,f8_w8a8_block,torch.float16,2,128,7168,256,8,0,0,asm,10007+20101,75.7958 +gfx938,f8_w8a8_block,torch.float16,3,128,7168,256,8,0,0,asm,10002+20101,88.1569 +gfx938,f8_w8a8_block,torch.float16,4,128,7168,256,8,0,0,asm,10002+20101,101.3875 +gfx938,f8_w8a8_block,torch.float16,5,128,7168,256,8,0,0,asm,10004+20101,124.3946 +gfx938,f8_w8a8_block,torch.float16,6,128,7168,256,8,0,0,asm,10006+20101,130.9464 +gfx938,f8_w8a8_block,torch.float16,7,128,7168,256,8,0,0,asm,10006+20101,137.4365 +gfx938,f8_w8a8_block,torch.float16,8,128,7168,256,8,0,0,asm,10006+20101,143.0817 +gfx938,f8_w8a8_block,torch.float16,9,128,7168,256,8,0,0,asm,10002+20101,157.5882 +gfx938,f8_w8a8_block,torch.float16,10,128,7168,256,8,0,0,asm,10002+20101,168.0538 +gfx938,f8_w8a8_block,torch.float16,11,128,7168,256,8,0,0,asm,10002+20101,198.4332 +gfx938,f8_w8a8_block,torch.float16,12,128,7168,256,8,0,0,asm,10007+20101,200.4189 +gfx938,f8_w8a8_block,torch.float16,13,128,7168,256,8,0,0,asm,10002+20101,203.4917 +gfx938,f8_w8a8_block,torch.float16,14,128,7168,256,8,0,0,asm,10002+20101,206.6752 +gfx938,f8_w8a8_block,torch.float16,15,128,7168,256,8,0,0,asm,10002+20101,211.3234 
+gfx938,f8_w8a8_block,torch.float16,16,128,7168,256,8,0,0,asm,10002+20101,215.7665 +gfx938,f8_w8a8_block,torch.float16,17,128,7168,256,8,0,0,asm,10002+20101,218.0528 +gfx938,f8_w8a8_block,torch.float16,18,128,7168,256,8,0,0,asm,10001+20101,226.9306 +gfx938,f8_w8a8_block,torch.float16,20,128,7168,256,8,0,0,asm,10006+20101,249.3592 +gfx938,f8_w8a8_block,torch.float16,24,128,7168,256,8,0,0,asm,10006+20101,273.2813 +gfx938,f8_w8a8_block,torch.float16,28,128,7168,256,8,0,0,asm,10002+20101,326.4696 +gfx938,f8_w8a8_block,torch.float16,32,128,7168,256,8,0,0,asm,10001+20101,334.1086 +gfx938,f8_w8a8_block,torch.float16,34,128,7168,256,8,0,0,asm,10002+20101,322.4625 +gfx938,f8_w8a8_block,torch.float16,36,128,7168,256,8,0,0,asm,10002+20101,332.4153 +gfx938,f8_w8a8_block,torch.float16,40,128,7168,256,8,0,0,asm,10005+20101,359.0573 +gfx938,f8_w8a8_block,torch.float16,44,128,7168,256,8,0,0,asm,10005+20101,371.3075 +gfx938,f8_w8a8_block,torch.float16,48,128,7168,256,8,0,0,asm,10006+20101,377.0101 +gfx938,f8_w8a8_block,torch.float16,56,128,7168,256,8,0,0,asm,10002+20101,386.4541 +gfx938,f8_w8a8_block,torch.float16,64,128,7168,256,8,0,0,asm,10002+20101,414.1463 +gfx938,f8_w8a8_block,torch.float16,68,128,7168,256,8,0,0,asm,10002+20101,423.1842 +gfx938,f8_w8a8_block,torch.float16,72,128,7168,256,8,0,0,asm,10002+20100,456.7184 +gfx938,f8_w8a8_block,torch.float16,80,128,7168,256,8,0,0,asm,10007+20101,448.1646 +gfx938,f8_w8a8_block,torch.float16,88,128,7168,256,8,0,0,asm,10002+20101,448.0963 +gfx938,f8_w8a8_block,torch.float16,96,128,7168,256,8,0,0,asm,10002+20101,447.221 +gfx938,f8_w8a8_block,torch.float16,104,128,7168,256,8,0,0,asm,10001+20101,459.5944 +gfx938,f8_w8a8_block,torch.float16,112,128,7168,256,8,0,0,asm,10001+20101,457.5964 +gfx938,f8_w8a8_block,torch.float16,128,128,7168,256,8,0,0,asm,10002+20101,465.9861 +gfx938,f8_w8a8_block,torch.float16,144,128,7168,256,8,0,0,asm,10002+20101,487.6434 +gfx938,f8_w8a8_block,torch.float16,160,128,7168,256,8,0,0,asm,10002+20101,493.5511 
+gfx938,f8_w8a8_block,torch.float16,192,128,7168,256,8,0,0,asm,10001+20101,501.9257 +gfx938,f8_w8a8_block,torch.float16,224,128,7168,256,8,0,0,asm,10005+20101,507.8977 +gfx938,f8_w8a8_block,torch.float16,256,128,7168,256,8,0,0,asm,10002+20101,511.9018 +gfx938,f8_w8a8_block,torch.float16,320,128,7168,256,8,0,0,asm,10005+20101,532.5335 +gfx938,f8_w8a8_block,torch.float16,384,128,7168,256,8,0,0,asm,10002+20101,538.9704 +gfx938,f8_w8a8_block,torch.float16,448,128,7168,256,8,0,0,asm,10002+20101,565.112 +gfx938,f8_w8a8_block,torch.float16,512,128,7168,256,8,0,0,asm,10006+20101,588.5628 +gfx938,f8_w8a8_block,torch.float16,576,128,7168,256,8,0,0,asm,11010+21101,614.6829 +gfx938,f8_w8a8_block,torch.float16,640,128,7168,256,8,0,0,asm,11010+21101,628.6065 +gfx938,f8_w8a8_block,torch.float16,704,128,7168,256,8,0,0,asm,11010+21101,634.5557 +gfx938,f8_w8a8_block,torch.float16,768,128,7168,256,8,0,0,asm,11010+21101,650.5685 +gfx938,f8_w8a8_block,torch.float16,832,128,7168,256,8,0,0,asm,11010+21101,657.2756 +gfx938,f8_w8a8_block,torch.float16,896,128,7168,256,8,0,0,asm,11006+21101,696.1905 +gfx938,f8_w8a8_block,torch.float16,960,128,7168,256,8,0,0,asm,11010+21101,708.2019 +gfx938,f8_w8a8_block,torch.float16,1024,128,7168,256,8,0,0,asm,11010+21101,757.7168 +gfx938,f8_w8a8_block,torch.float16,1152,128,7168,256,8,0,0,asm,11010+21101,829.2878 +gfx938,f8_w8a8_block,torch.float16,1280,128,7168,256,8,0,0,asm,11010+21101,868.635 +gfx938,f8_w8a8_block,torch.float16,1408,128,7168,256,8,0,0,asm,11010+21101,899.1951 +gfx938,f8_w8a8_block,torch.float16,1536,128,7168,256,8,0,0,asm,11010+21101,975.7805 +gfx938,f8_w8a8_block,torch.float16,1664,128,7168,256,8,0,0,asm,11010+21101,996.4617 +gfx938,f8_w8a8_block,torch.float16,1792,128,7168,256,8,0,0,asm,11010+21101,1030.6891 +gfx938,f8_w8a8_block,torch.float16,1920,128,7168,256,8,0,0,asm,11010+21101,1101.5501 +gfx938,f8_w8a8_block,torch.float16,2048,128,7168,256,8,0,0,asm,11010+21101,1145.2352 
+gfx938,f8_w8a8_block,torch.float16,2304,128,7168,256,8,0,0,asm,11010+21101,1308.0337 +gfx938,f8_w8a8_block,torch.float16,2560,128,7168,256,8,0,0,asm,11010+21101,1375.4024 +gfx938,f8_w8a8_block,torch.float16,2816,128,7168,256,8,0,0,asm,11010+21101,1478.738 +gfx938,f8_w8a8_block,torch.float16,3072,128,7168,256,8,0,0,asm,11010+21101,1602.2884 +gfx938,f8_w8a8_block,torch.float16,3328,128,7168,256,8,0,0,asm,11010+21101,1727.6481 +gfx938,f8_w8a8_block,torch.float16,3584,128,7168,256,8,0,0,asm,11010+21101,1849.196 +gfx938,f8_w8a8_block,torch.float16,3840,128,7168,256,8,0,0,asm,11010+21101,1911.3202 +gfx938,f8_w8a8_block,torch.float16,4096,128,7168,256,8,0,0,asm,11010+21101,2019.8641 +gfx938,f8_w8a8_block,torch.float16,4608,128,7168,256,8,0,0,asm,11010+21101,2259.4566 +gfx938,f8_w8a8_block,torch.float16,5120,128,7168,256,8,0,0,asm,11010+21101,2484.118 +gfx938,f8_w8a8_block,torch.float16,5632,128,7168,256,8,0,0,asm,11010+21101,2722.4067 +gfx938,f8_w8a8_block,torch.float16,6144,128,7168,256,8,0,0,asm,11010+21101,2944.3125 +gfx938,f8_w8a8_block,torch.float16,6656,128,7168,256,8,0,0,asm,11010+21101,3141.9019 +gfx938,f8_w8a8_block,torch.float16,7168,128,7168,256,8,0,0,asm,11010+21101,3375.1217 +gfx938,f8_w8a8_block,torch.float16,7680,128,7168,256,8,0,0,asm,11010+21101,3597.4721 +gfx938,f8_w8a8_block,torch.float16,8192,128,7168,256,8,0,0,asm,11010+21101,3816.3838 +gfx938,f8_w8a8_block,torch.float16,10240,128,7168,256,8,0,0,asm,11010+21101,4706.2116 +gfx938,f8_w8a8_block,torch.float16,12288,128,7168,256,8,0,0,asm,11010+21101,5647.2745 +gfx938,f8_w8a8_block,torch.float16,14336,128,7168,256,8,0,0,asm,11010+21101,6490.2357 +gfx938,f8_w8a8_block,torch.float16,16384,128,7168,256,8,0,0,asm,11010+21101,7346.2153 +gfx938,f8_w8a8_block,torch.float16,17408,128,7168,256,8,0,0,asm,11010+21101,7819.7326 +gfx938,f8_w8a8_block,torch.float16,24576,128,7168,256,8,0,0,asm,11010+21101,11024.8311 +gfx938,f8_w8a8_block,torch.float16,32768,128,7168,256,8,0,0,asm,11010+21101,14518.6201 
+gfx938,f8_w8a8_block,torch.float16,40960,128,7168,256,8,0,0,asm,13001+23101,14697.0981 +gfx938,f8_w8a8_block,torch.float16,49152,128,7168,256,8,0,0,asm,11010+21101,20838.8615 +gfx938,f8_w8a8_block,torch.float16,57344,128,7168,256,8,0,0,asm,11010+21101,24568.6648 +gfx938,f8_w8a8_block,torch.float16,65536,128,7168,256,8,0,0,asm,13001+23000,28256.1414 +gfx938,f8_w8a8_block,torch.float16,1,256,7168,256,8,0,0,asm,10007+20000,72.6803 +gfx938,f8_w8a8_block,torch.float16,2,256,7168,256,8,0,0,asm,10002+20000,103.4148 +gfx938,f8_w8a8_block,torch.float16,3,256,7168,256,8,0,0,asm,10005+20000,134.748 +gfx938,f8_w8a8_block,torch.float16,4,256,7168,256,8,0,0,asm,10006+20000,166.8146 +gfx938,f8_w8a8_block,torch.float16,5,256,7168,256,8,0,0,asm,10001+20000,199.2455 +gfx938,f8_w8a8_block,torch.float16,6,256,7168,256,8,0,0,asm,10002+20000,209.641 +gfx938,f8_w8a8_block,torch.float16,7,256,7168,256,8,0,0,asm,10002+20000,225.1832 +gfx938,f8_w8a8_block,torch.float16,8,256,7168,256,8,0,0,asm,10006+20000,240.6587 +gfx938,f8_w8a8_block,torch.float16,9,256,7168,256,8,0,0,asm,10005+20000,259.4617 +gfx938,f8_w8a8_block,torch.float16,10,256,7168,256,8,0,0,asm,10006+20000,275.2421 +gfx938,f8_w8a8_block,torch.float16,11,256,7168,256,8,0,0,asm,10002+20000,312.0341 +gfx938,f8_w8a8_block,torch.float16,12,256,7168,256,8,0,0,asm,10002+20000,309.8362 +gfx938,f8_w8a8_block,torch.float16,13,256,7168,256,8,0,0,asm,10002+20000,321.2168 +gfx938,f8_w8a8_block,torch.float16,14,256,7168,256,8,0,0,asm,10006+20000,342.3694 +gfx938,f8_w8a8_block,torch.float16,15,256,7168,256,8,0,0,asm,10006+20000,352.1916 +gfx938,f8_w8a8_block,torch.float16,16,256,7168,256,8,0,0,asm,10006+20000,359.8449 +gfx938,f8_w8a8_block,torch.float16,17,256,7168,256,8,0,0,asm,10006+20000,365.372 +gfx938,f8_w8a8_block,torch.float16,18,256,7168,256,8,0,0,asm,10006+20000,378.5073 +gfx938,f8_w8a8_block,torch.float16,20,256,7168,256,8,0,0,asm,10002+20000,408.3221 +gfx938,f8_w8a8_block,torch.float16,24,256,7168,256,8,0,0,asm,10006+20000,459.4637 
+gfx938,f8_w8a8_block,torch.float16,28,256,7168,256,8,0,0,asm,10002+20000,521.102 +gfx938,f8_w8a8_block,torch.float16,32,256,7168,256,8,0,0,asm,10006+20000,554.6966 +gfx938,f8_w8a8_block,torch.float16,34,256,7168,256,8,0,0,asm,10006+20000,551.9878 +gfx938,f8_w8a8_block,torch.float16,36,256,7168,256,8,0,0,asm,10002+20000,564.9833 +gfx938,f8_w8a8_block,torch.float16,40,256,7168,256,8,0,0,asm,10002+20000,595.1432 +gfx938,f8_w8a8_block,torch.float16,44,256,7168,256,8,0,0,asm,10002+20000,619.5608 +gfx938,f8_w8a8_block,torch.float16,48,256,7168,256,8,0,0,asm,10002+20000,628.6184 +gfx938,f8_w8a8_block,torch.float16,56,256,7168,256,8,0,0,asm,10002+20000,662.9176 +gfx938,f8_w8a8_block,torch.float16,64,256,7168,256,8,0,0,asm,10002+20000,687.9598 +gfx938,f8_w8a8_block,torch.float16,68,256,7168,256,8,0,0,asm,10002+20000,689.9695 +gfx938,f8_w8a8_block,torch.float16,72,256,7168,256,8,0,0,asm,10002+20000,711.1765 +gfx938,f8_w8a8_block,torch.float16,80,256,7168,256,8,0,0,asm,10002+20000,726.5693 +gfx938,f8_w8a8_block,torch.float16,88,256,7168,256,8,0,0,asm,10002+20000,754.9446 +gfx938,f8_w8a8_block,torch.float16,96,256,7168,256,8,0,0,asm,10002+20000,757.3035 +gfx938,f8_w8a8_block,torch.float16,104,256,7168,256,8,0,0,asm,10002+20000,776.5309 +gfx938,f8_w8a8_block,torch.float16,112,256,7168,256,8,0,0,asm,10002+20000,779.1336 +gfx938,f8_w8a8_block,torch.float16,128,256,7168,256,8,0,0,asm,10002+20000,794.9824 +gfx938,f8_w8a8_block,torch.float16,144,256,7168,256,8,0,0,asm,10002+20000,802.5377 +gfx938,f8_w8a8_block,torch.float16,160,256,7168,256,8,0,0,asm,10002+20000,810.6245 +gfx938,f8_w8a8_block,torch.float16,192,256,7168,256,8,0,0,asm,10001+20000,822.7867 +gfx938,f8_w8a8_block,torch.float16,224,256,7168,256,8,0,0,asm,10002+20000,831.9725 +gfx938,f8_w8a8_block,torch.float16,256,256,7168,256,8,0,0,asm,10002+20000,842.2666 +gfx938,f8_w8a8_block,torch.float16,320,256,7168,256,8,0,0,asm,10002+20000,873.6665 
+gfx938,f8_w8a8_block,torch.float16,384,256,7168,256,8,0,0,asm,10006+20000,905.6093 +gfx938,f8_w8a8_block,torch.float16,448,256,7168,256,8,0,0,asm,11006+21000,940.6409 +gfx938,f8_w8a8_block,torch.float16,512,256,7168,256,8,0,0,asm,11006+21000,955.3874 +gfx938,f8_w8a8_block,torch.float16,576,256,7168,256,8,0,0,asm,11006+21000,970.2529 +gfx938,f8_w8a8_block,torch.float16,640,256,7168,256,8,0,0,asm,11006+21000,985.9352 +gfx938,f8_w8a8_block,torch.float16,704,256,7168,256,8,0,0,asm,11006+21000,1008.4660 +gfx938,f8_w8a8_block,torch.float16,768,256,7168,256,8,0,0,asm,11010+21000,1026.7544 +gfx938,f8_w8a8_block,torch.float16,832,256,7168,256,8,0,0,asm,11010+21000,1039.0986 +gfx938,f8_w8a8_block,torch.float16,896,256,7168,256,8,0,0,asm,11010+21000,1083.3307 +gfx938,f8_w8a8_block,torch.float16,960,256,7168,256,8,0,0,asm,11010+21000,1143.5322 +gfx938,f8_w8a8_block,torch.float16,1024,256,7168,256,8,0,0,asm,11010+21000,1199.3986 +gfx938,f8_w8a8_block,torch.float16,1152,256,7168,256,8,0,0,asm,11010+21000,1338.1607 +gfx938,f8_w8a8_block,torch.float16,1280,256,7168,256,8,0,0,asm,12003+22000,1385.7713 +gfx938,f8_w8a8_block,torch.float16,1408,256,7168,256,8,0,0,asm,12003+22000,1423.7195 +gfx938,f8_w8a8_block,torch.float16,1536,256,7168,256,8,0,0,asm,12003+22000,1453.7754 +gfx938,f8_w8a8_block,torch.float16,1664,256,7168,256,8,0,0,asm,12003+22000,1476.8312 +gfx938,f8_w8a8_block,torch.float16,1792,256,7168,256,8,0,0,asm,12003+22000,1577.4284 +gfx938,f8_w8a8_block,torch.float16,1920,256,7168,256,8,0,0,asm,12003+22000,1625.2879 +gfx938,f8_w8a8_block,torch.float16,2048,256,7168,256,8,0,0,asm,12003+22000,1835.2787 +gfx938,f8_w8a8_block,torch.float16,2304,256,7168,256,8,0,0,asm,11010+21000,2116.7298 +gfx938,f8_w8a8_block,torch.float16,2560,256,7168,256,8,0,0,asm,11010+21000,2245.6297 +gfx938,f8_w8a8_block,torch.float16,2816,256,7168,256,8,0,0,asm,11010+21000,2377.2784 +gfx938,f8_w8a8_block,torch.float16,3072,256,7168,256,8,0,0,asm,13001+23000,2465.8043 
+gfx938,f8_w8a8_block,torch.float16,3328,256,7168,256,8,0,0,asm,13001+23000,2540.153 +gfx938,f8_w8a8_block,torch.float16,3584,256,7168,256,8,0,0,asm,13001+23000,2627.0902 +gfx938,f8_w8a8_block,torch.float16,3840,256,7168,256,8,0,0,asm,12003+22000,2800.8796 +gfx938,f8_w8a8_block,torch.float16,4096,256,7168,256,8,0,0,asm,12003+22000,3084.2312 +gfx938,f8_w8a8_block,torch.float16,4608,256,7168,256,8,0,0,asm,12003+22000,3563.8115 +gfx938,f8_w8a8_block,torch.float16,5120,256,7168,256,8,0,0,asm,12003+22000,3755.4826 +gfx938,f8_w8a8_block,torch.float16,5632,256,7168,256,8,0,0,asm,12003+22000,3950.6285 +gfx938,f8_w8a8_block,torch.float16,6144,256,7168,256,8,0,0,asm,12003+22000,4354.9376 +gfx938,f8_w8a8_block,torch.float16,6656,256,7168,256,8,0,0,asm,13001+23000,4595.7098 +gfx938,f8_w8a8_block,torch.float16,7168,256,7168,256,8,0,0,asm,13001+23000,4714.1942 +gfx938,f8_w8a8_block,torch.float16,7680,256,7168,256,8,0,0,asm,13001+23000,5001.5109 +gfx938,f8_w8a8_block,torch.float16,8192,256,7168,256,8,0,0,asm,13001+23000,5565.8363 +gfx938,f8_w8a8_block,torch.float16,10240,256,7168,256,8,0,0,asm,13001+23000,6804.4078 +gfx938,f8_w8a8_block,torch.float16,12288,256,7168,256,8,0,0,asm,13001+23000,7889.1483 +gfx938,f8_w8a8_block,torch.float16,14336,256,7168,256,8,0,0,asm,13001+23000,9148.5336 +gfx938,f8_w8a8_block,torch.float16,16384,256,7168,256,8,0,0,asm,13001+23000,10246.3196 +gfx938,f8_w8a8_block,torch.float16,17408,256,7168,256,8,0,0,asm,13001+23000,11124.6754 +gfx938,f8_w8a8_block,torch.float16,24576,256,7168,256,8,0,0,asm,13001+23000,14963.2569 +gfx938,f8_w8a8_block,torch.float16,32768,256,7168,256,8,0,0,asm,13001+23000,19704.3071 +gfx938,f8_w8a8_block,torch.float16,40960,256,7168,256,8,0,0,asm,13001+23000,23143.0651 +gfx938,f8_w8a8_block,torch.float16,49152,256,7168,256,8,0,0,asm,13001+23000,27894.5593 +gfx938,f8_w8a8_block,torch.float16,57344,256,7168,256,8,0,0,asm,13001+23000,32751.0668 +gfx938,f8_w8a8_block,torch.float16,65536,256,7168,256,8,0,0,asm,13001+23000,37602.0424 
+gfx938,f8_w8a8_block,torch.float16,1,512,7168,256,8,0,0,asm,10001+20000,98.9608 +gfx938,f8_w8a8_block,torch.float16,2,512,7168,256,8,0,0,asm,10005+20000,148.7373 +gfx938,f8_w8a8_block,torch.float16,4,512,7168,256,8,0,0,asm,10002+20000,255.7451 +gfx938,f8_w8a8_block,torch.float16,6,512,7168,256,8,0,0,asm,10006+20000,331.7556 +gfx938,f8_w8a8_block,torch.float16,8,512,7168,256,8,0,0,asm,10001+20000,402.7571 +gfx938,f8_w8a8_block,torch.float16,16,512,7168,256,8,0,0,asm,10002+20000,620.9451 +gfx938,f8_w8a8_block,torch.float16,24,512,7168,256,8,0,0,asm,10002+20000,813.7066 +gfx938,f8_w8a8_block,torch.float16,32,512,7168,256,8,0,0,asm,10001+20000,977.7223 +gfx938,f8_w8a8_block,torch.float16,48,512,7168,256,8,0,0,asm,10002+20000,1144.5069 +gfx938,f8_w8a8_block,torch.float16,56,512,7168,256,8,0,0,asm,10001+20000,1205.4824 +gfx938,f8_w8a8_block,torch.float16,64,512,7168,256,8,0,0,asm,10002+20000,1248.6146 +gfx938,f8_w8a8_block,torch.float16,68,512,7168,256,8,0,0,asm,10002+20000,1261.4766 +gfx938,f8_w8a8_block,torch.float16,72,512,7168,256,8,0,0,asm,10001+20000,1295.1417 +gfx938,f8_w8a8_block,torch.float16,80,512,7168,256,8,0,0,asm,10002+20000,1328.6098 +gfx938,f8_w8a8_block,torch.float16,88,512,7168,256,8,0,0,asm,10002+20000,1374.6385 +gfx938,f8_w8a8_block,torch.float16,90,512,7168,256,8,0,0,asm,11006+21000,1422.9087 +gfx938,f8_w8a8_block,torch.float16,96,512,7168,256,8,0,0,asm,10002+20000,1393.8895 +gfx938,f8_w8a8_block,torch.float16,100,512,7168,256,8,0,0,asm,10002+20000,1392.3682 +gfx938,f8_w8a8_block,torch.float16,112,512,7168,256,8,0,0,asm,10002+20000,1428.6178 +gfx938,f8_w8a8_block,torch.float16,128,512,7168,256,8,0,0,asm,10002+20000,1454.5895 +gfx938,f8_w8a8_block,torch.float16,160,512,7168,256,8,0,0,asm,10002+20000,1480.1684 +gfx938,f8_w8a8_block,torch.float16,224,512,7168,256,8,0,0,asm,10002+20000,1509.5907 +gfx938,f8_w8a8_block,torch.float16,256,512,7168,256,8,0,0,asm,10002+20000,1523.7349 
+gfx938,f8_w8a8_block,torch.float16,384,512,7168,256,8,0,0,asm,11005+21000,1591.8786 +gfx938,f8_w8a8_block,torch.float16,512,512,7168,256,8,0,0,asm,11006+21000,1629.0737 +gfx938,f8_w8a8_block,torch.float16,640,512,7168,256,8,0,0,asm,11006+21000,1673.5754 +gfx938,f8_w8a8_block,torch.float16,768,512,7168,256,8,0,0,asm,11010+21000,1724.7656 +gfx938,f8_w8a8_block,torch.float16,1024,512,7168,256,8,0,0,asm,11010+21000,1932.1925 +gfx938,f8_w8a8_block,torch.float16,1536,512,7168,256,8,0,0,asm,12003+22000,2149.6496 +gfx938,f8_w8a8_block,torch.float16,2048,512,7168,256,8,0,0,asm,12003+22000,2760.8171 +gfx938,f8_w8a8_block,torch.float16,3072,512,7168,256,8,0,0,asm,13001+23000,3618.1797 +gfx938,f8_w8a8_block,torch.float16,4096,512,7168,256,8,0,0,asm,12003+22000,4654.0263 +gfx938,f8_w8a8_block,torch.float16,6144,512,7168,256,8,0,0,asm,12003+22000,6558.0078 +gfx938,f8_w8a8_block,torch.float16,7168,512,7168,256,8,0,0,asm,13001+23000,7055.2258 +gfx938,f8_w8a8_block,torch.float16,7680,512,7168,256,8,0,0,asm,13001+23000,7456.4795 +gfx938,f8_w8a8_block,torch.float16,8192,512,7168,256,8,0,0,asm,13001+23000,8370.6686 +gfx938,f8_w8a8_block,torch.float16,12288,512,7168,256,8,0,0,asm,13001+23000,11901.9833 +gfx938,f8_w8a8_block,torch.float16,16384,512,7168,256,8,0,0,asm,13001+23000,15240.726 +gfx938,f8_w8a8_block,torch.float16,1,1024,7168,256,8,0,0,asm,10006+20000,151.2889 +gfx938,f8_w8a8_block,torch.float16,2,1024,7168,256,8,0,0,asm,10006+20000,237.6805 +gfx938,f8_w8a8_block,torch.float16,4,1024,7168,256,8,0,0,asm,10006+20000,429.1105 +gfx938,f8_w8a8_block,torch.float16,6,1024,7168,256,8,0,0,asm,10001+20000,578.6652 +gfx938,f8_w8a8_block,torch.float16,8,1024,7168,256,8,0,0,asm,10002+20000,704.0545 +gfx938,f8_w8a8_block,torch.float16,16,1024,7168,256,8,0,0,asm,10001+20000,1138.7775 +gfx938,f8_w8a8_block,torch.float16,24,1024,7168,256,8,0,0,asm,10001+20000,1531.4521 +gfx938,f8_w8a8_block,torch.float16,32,1024,7168,256,8,0,0,asm,10002+20000,1861.577 
+gfx938,f8_w8a8_block,torch.float16,48,1024,7168,256,8,0,0,asm,10002+20000,2205.7991 +gfx938,f8_w8a8_block,torch.float16,56,1024,7168,256,8,0,0,asm,10002+20000,2319.8525 +gfx938,f8_w8a8_block,torch.float16,64,1024,7168,256,8,0,0,asm,10001+20000,2411.5447 +gfx938,f8_w8a8_block,torch.float16,68,1024,7168,256,8,0,0,asm,10002+20000,2432.3937 +gfx938,f8_w8a8_block,torch.float16,72,1024,7168,256,8,0,0,asm,10002+20000,2542.6718 +gfx938,f8_w8a8_block,torch.float16,80,1024,7168,256,8,0,0,asm,11005+21000,2595.5271 +gfx938,f8_w8a8_block,torch.float16,88,1024,7168,256,8,0,0,asm,10001+20000,2651.8934 +gfx938,f8_w8a8_block,torch.float16,90,1024,7168,256,8,0,0,asm,10002+20000,2716.6178 +gfx938,f8_w8a8_block,torch.float16,96,1024,7168,256,8,0,0,asm,10002+20000,2688.2667 +gfx938,f8_w8a8_block,torch.float16,100,1024,7168,256,8,0,0,asm,10002+20000,2684.4469 +gfx938,f8_w8a8_block,torch.float16,112,1024,7168,256,8,0,0,asm,10002+20000,2754.0411 +gfx938,f8_w8a8_block,torch.float16,128,1024,7168,256,8,0,0,asm,10002+20000,2795.1093 +gfx938,f8_w8a8_block,torch.float16,160,1024,7168,256,8,0,0,asm,10001+20000,2857.7264 +gfx938,f8_w8a8_block,torch.float16,224,1024,7168,256,8,0,0,asm,10001+20000,2896.1117 +gfx938,f8_w8a8_block,torch.float16,256,1024,7168,256,8,0,0,asm,10002+20000,2914.3146 +gfx938,f8_w8a8_block,torch.float16,384,1024,7168,256,8,0,0,asm,11005+21000,2997.6185 +gfx938,f8_w8a8_block,torch.float16,512,1024,7168,256,8,0,0,asm,11010+21000,3053.1762 +gfx938,f8_w8a8_block,torch.float16,640,1024,7168,256,8,0,0,asm,11010+21000,3092.1409 +gfx938,f8_w8a8_block,torch.float16,768,1024,7168,256,8,0,0,asm,11010+21000,3186.2115 +gfx938,f8_w8a8_block,torch.float16,1024,1024,7168,256,8,0,0,asm,12003+22000,3407.8668 +gfx938,f8_w8a8_block,torch.float16,1536,1024,7168,256,8,0,0,asm,12003+22000,3576.4024 +gfx938,f8_w8a8_block,torch.float16,2048,1024,7168,256,8,0,0,asm,12003+22000,4605.3776 +gfx938,f8_w8a8_block,torch.float16,3072,1024,7168,256,8,0,0,asm,13001+23000,5888.471 
+gfx938,f8_w8a8_block,torch.float16,4096,1024,7168,256,8,0,0,asm,12003+22000,7722.4229 +gfx938,f8_w8a8_block,torch.float16,6144,1024,7168,256,8,0,0,asm,12003+22000,10916.0147 +gfx938,f8_w8a8_block,torch.float16,7168,1024,7168,256,8,0,0,asm,13001+23000,11507.4329 +gfx938,f8_w8a8_block,torch.float16,7680,1024,7168,256,8,0,0,asm,13001+23000,12128.7707 +gfx938,f8_w8a8_block,torch.float16,8192,1024,7168,256,8,0,0,asm,13001+23000,13783.3974 +gfx938,f8_w8a8_block,torch.float16,12288,1024,7168,256,8,0,0,asm,13001+23000,19406.7175 +gfx938,f8_w8a8_block,torch.float16,16384,1024,7168,256,8,0,0,asm,13001+23000,25044.1644 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,256,8,0,0,asm,10002+20000,251.9628 +gfx938,f8_w8a8_block,torch.float16,2,2048,7168,256,8,0,0,asm,10006+20000,427.6089 +gfx938,f8_w8a8_block,torch.float16,4,2048,7168,256,8,0,0,asm,10001+20000,771.2179 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,256,8,0,0,asm,10001+20000,1070.6738 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,256,8,0,0,asm,10001+20000,1326.8262 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,256,8,0,0,asm,10001+20000,2202.1582 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,256,8,0,0,asm,10001+20000,3013.2885 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,256,8,0,0,asm,10001+20000,3641.0618 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,256,8,0,0,asm,10001+20000,4355.4914 +gfx938,f8_w8a8_block,torch.float16,56,2048,7168,256,8,0,0,asm,10001+20000,4593.266 +gfx938,f8_w8a8_block,torch.float16,64,2048,7168,256,8,0,0,asm,10001+20000,4767.8612 +gfx938,f8_w8a8_block,torch.float16,68,2048,7168,256,8,0,0,asm,10001+20000,4860.9942 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,256,8,0,0,asm,10001+20000,4949.9584 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,256,8,0,0,asm,10001+20000,5088.8829 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,256,8,0,0,asm,11006+21000,5266.996 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,256,8,0,0,asm,10001+20000,5320.5549 
+gfx938,f8_w8a8_block,torch.float16,96,2048,7168,256,8,0,0,asm,10001+20000,5308.1328 +gfx938,f8_w8a8_block,torch.float16,100,2048,7168,256,8,0,0,asm,10001+20000,5316.2476 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,256,8,0,0,asm,10001+20000,5450.785 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,256,8,0,0,asm,10001+20000,5529.5102 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,256,8,0,0,asm,10001+20000,5638.9213 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,256,8,0,0,asm,10001+20000,5691.9585 +gfx938,f8_w8a8_block,torch.float16,256,2048,7168,256,8,0,0,asm,10001+20000,5716.8708 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,256,8,0,0,asm,11010+21000,5836.1793 +gfx938,f8_w8a8_block,torch.float16,512,2048,7168,256,8,0,0,asm,11010+21000,5915.5616 +gfx938,f8_w8a8_block,torch.float16,640,2048,7168,256,8,0,0,asm,11010+21000,5985.1953 +gfx938,f8_w8a8_block,torch.float16,768,2048,7168,256,8,0,0,asm,12003+22000,6111.1721 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,256,8,0,0,asm,12003+22000,6233.1449 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,256,8,0,0,asm,12003+22000,6516.6345 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,256,8,0,0,asm,12003+22000,8578.0631 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,256,8,0,0,asm,13001+23000,10717.5206 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,256,8,0,0,asm,12003+22000,14547.0352 +gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,256,8,0,0,asm,12003+22000,19927.8669 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,256,8,0,0,asm,13001+23000,21524.7389 +gfx938,f8_w8a8_block,torch.float16,7680,2048,7168,256,8,0,0,asm,13001+23000,22661.5994 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,256,8,0,0,asm,13001+23000,25611.2774 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,256,8,0,0,asm,13001+23000,36065.8213 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,256,8,0,0,asm,13001+23000,46297.6444 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,288,8,0,0,asm,10006+20000,252.0489 
+gfx938,f8_w8a8_block,torch.float16,2,2048,7168,288,8,0,0,asm,10006+20000,430.5957 +gfx938,f8_w8a8_block,torch.float16,4,2048,7168,288,8,0,0,asm,10002+20000,756.1026 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,288,8,0,0,asm,10001+20000,1069.1647 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,288,8,0,0,asm,10001+20000,1306.9971 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,288,8,0,0,asm,10001+20000,2284.3784 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,288,8,0,0,asm,10001+20000,3099.9363 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,288,8,0,0,asm,10001+20000,3767.9718 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,288,8,0,0,asm,10001+20000,4620.3121 +gfx938,f8_w8a8_block,torch.float16,56,2048,7168,288,8,0,0,asm,10001+20000,4939.5057 +gfx938,f8_w8a8_block,torch.float16,64,2048,7168,288,8,0,0,asm,10001+20000,5129.1815 +gfx938,f8_w8a8_block,torch.float16,68,2048,7168,288,8,0,0,asm,10002+20000,5164.4877 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,288,8,0,0,asm,10001+20000,5308.4833 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,288,8,0,0,asm,10001+20000,5387.8599 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,288,8,0,0,asm,10001+20000,5502.0268 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,288,8,0,0,asm,10001+20000,5595.4265 +gfx938,f8_w8a8_block,torch.float16,96,2048,7168,288,8,0,0,asm,10001+20000,5631.7624 +gfx938,f8_w8a8_block,torch.float16,100,2048,7168,288,8,0,0,asm,10001+20000,5679.1349 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,288,8,0,0,asm,10001+20000,5931.2481 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,288,8,0,0,asm,10001+20000,6013.378 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,288,8,0,0,asm,10001+20000,6180.9942 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,288,8,0,0,asm,10001+20000,6371.7133 +gfx938,f8_w8a8_block,torch.float16,256,2048,7168,288,8,0,0,asm,10001+20000,6398.0948 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,288,8,0,0,asm,11010+21000,6508.5766 
+gfx938,f8_w8a8_block,torch.float16,512,2048,7168,288,8,0,0,asm,11010+21000,6613.3309 +gfx938,f8_w8a8_block,torch.float16,640,2048,7168,288,8,0,0,asm,11010+21000,6681.1973 +gfx938,f8_w8a8_block,torch.float16,768,2048,7168,288,8,0,0,asm,12003+22000,6787.456 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,288,8,0,0,asm,12003+22000,6932.4909 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,288,8,0,0,asm,12002+22000,7172.1113 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,288,8,0,0,asm,12003+22000,8085.1632 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,288,8,0,0,asm,13001+23000,11887.6732 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,288,8,0,0,asm,13001+23000,13355.7484 +gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,288,8,0,0,asm,12003+22000,19758.7502 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,288,8,0,0,asm,12003+22000,23482.3928 +gfx938,f8_w8a8_block,torch.float16,7680,2048,7168,288,8,0,0,asm,13001+23000,23931.7054 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,288,8,0,0,asm,13001+23000,24374.1711 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,288,8,0,0,asm,13001+23000,34948.4318 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,288,8,0,0,asm,13001+23000,46757.3991 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,36,8,0,0,asm,10006+20000,242.5053 +gfx938,f8_w8a8_block,torch.float16,2,2048,7168,36,8,0,0,asm,10002+20000,379.8434 +gfx938,f8_w8a8_block,torch.float16,4,2048,7168,36,8,0,0,asm,10002+20000,568.874 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,36,8,0,0,asm,10002+20000,717.4164 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,36,8,0,0,asm,10002+20000,759.7488 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,36,8,0,0,asm,10002+20000,831.6938 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,36,8,0,0,asm,10001+20000,865.2568 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,36,8,0,0,asm,10002+20000,866.5272 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,36,8,0,0,asm,11010+21000,892.3441 
+gfx938,f8_w8a8_block,torch.float16,56,2048,7168,36,8,0,0,asm,11010+21000,899.0294 +gfx938,f8_w8a8_block,torch.float16,64,2048,7168,36,8,0,0,asm,11010+21000,909.4108 +gfx938,f8_w8a8_block,torch.float16,68,2048,7168,36,8,0,0,asm,11010+21000,907.7949 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,36,8,0,0,asm,11009+21000,984.5952 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,36,8,0,0,asm,11010+21000,906.8205 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,36,8,0,0,asm,11010+21000,913.0271 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,36,8,0,0,asm,11010+21000,910.8481 +gfx938,f8_w8a8_block,torch.float16,96,2048,7168,36,8,0,0,asm,11010+21000,914.0648 +gfx938,f8_w8a8_block,torch.float16,100,2048,7168,36,8,0,0,asm,11010+21000,916.0669 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,36,8,0,0,asm,11010+21000,944.7542 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,36,8,0,0,asm,12003+22000,981.2716 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,36,8,0,0,asm,12003+22000,976.5842 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,36,8,0,0,asm,12003+22000,1046.3545 +gfx938,f8_w8a8_block,torch.float16,256,2048,7168,36,8,0,0,asm,12003+22000,1117.3637 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,36,8,0,0,asm,13001+23000,1584.1359 +gfx938,f8_w8a8_block,torch.float16,512,2048,7168,36,8,0,0,asm,12003+22000,1730.4226 +gfx938,f8_w8a8_block,torch.float16,640,2048,7168,36,8,0,0,asm,12003+22000,2364.2128 +gfx938,f8_w8a8_block,torch.float16,768,2048,7168,36,8,0,0,asm,12003+22000,2495.0365 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,36,8,0,0,asm,13001+23000,2967.159 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,36,8,0,0,asm,13001+23000,4397.4882 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,36,8,0,0,asm,13001+23000,5729.3881 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,36,8,0,0,asm,13001+23000,8545.7815 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,36,8,0,0,asm,13001+23000,11050.4888 
+gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,36,8,0,0,asm,13001+23000,15905.5158 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,36,8,0,0,asm,13001+23000,18483.3557 +gfx938,f8_w8a8_block,torch.float16,7680,2048,7168,36,8,0,0,asm,13001+23000,19818.6245 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,36,8,0,0,asm,13001+23000,21101.9132 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,36,8,0,0,asm,13001+23000,31240.5371 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,36,8,0,0,asm,13001+23000,41174.3595 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,18,8,0,0,asm,10006+20000,253.2057 +gfx938,f8_w8a8_block,torch.float16,2,2048,7168,18,8,0,0,asm,10006+20000,340.1796 +gfx938,f8_w8a8_block,torch.float16,4,2048,7168,18,8,0,0,asm,10001+20000,450.665 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,18,8,0,0,asm,10002+20000,451.4404 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,18,8,0,0,asm,10002+20000,471.0792 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,18,8,0,0,asm,10001+20000,478.6772 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,18,8,0,0,asm,11010+21000,501.0289 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,18,8,0,0,asm,11010+21000,498.9725 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,18,8,0,0,asm,11007+21000,517.3899 +gfx938,f8_w8a8_block,torch.float16,56,2048,7168,18,8,0,0,asm,11010+21000,520.8237 +gfx938,f8_w8a8_block,torch.float16,64,2048,7168,18,8,0,0,asm,12003+22000,542.7262 +gfx938,f8_w8a8_block,torch.float16,68,2048,7168,18,8,0,0,asm,12003+22000,596.5075 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,18,8,0,0,asm,12001+22000,612.4255 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,18,8,0,0,asm,12003+22000,551.5192 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,18,8,0,0,asm,12003+22000,552.1879 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,18,8,0,0,asm,12003+22000,560.6104 +gfx938,f8_w8a8_block,torch.float16,96,2048,7168,18,8,0,0,asm,12003+22000,556.0361 
+gfx938,f8_w8a8_block,torch.float16,100,2048,7168,18,8,0,0,asm,12003+22000,572.2411 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,18,8,0,0,asm,12003+22000,563.2689 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,18,8,0,0,asm,12003+22000,624.3464 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,18,8,0,0,asm,12003+22000,824.3752 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,18,8,0,0,asm,12003+22000,860.9125 +gfx938,f8_w8a8_block,torch.float16,256,2048,7168,18,8,0,0,asm,12003+22000,893.8342 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,18,8,0,0,asm,12003+22000,1281.9552 +gfx938,f8_w8a8_block,torch.float16,512,2048,7168,18,8,0,0,asm,13001+23000,1540.5314 +gfx938,f8_w8a8_block,torch.float16,640,2048,7168,18,8,0,0,asm,12003+22000,2034.2148 +gfx938,f8_w8a8_block,torch.float16,768,2048,7168,18,8,0,0,asm,13001+23000,2212.7575 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,18,8,0,0,asm,13001+23000,2898.641 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,18,8,0,0,asm,13001+23000,4266.8676 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,18,8,0,0,asm,13001+23000,5588.6124 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,18,8,0,0,asm,13001+23000,7867.0413 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,18,8,0,0,asm,13001+23000,10511.3056 +gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,18,8,0,0,asm,13001+23000,15436.545 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,18,8,0,0,asm,13001+23000,17827.8971 +gfx938,f8_w8a8_block,torch.float16,7680,2048,7168,18,8,0,0,asm,13001+23000,18981.7078 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,18,8,0,0,asm,13001+23000,20322.7869 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,18,8,0,0,asm,13001+23000,30253.3927 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,18,8,0,0,asm,13001+23000,40252.3459 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,12,8,0,0,asm,10002+20000,265.6776 +gfx938,f8_w8a8_block,torch.float16,2,2048,7168,12,8,0,0,asm,10001+20000,356.1058 
+gfx938,f8_w8a8_block,torch.float16,4,2048,7168,12,8,0,0,asm,10002+20000,353.7674 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,12,8,0,0,asm,10006+20000,356.3274 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,12,8,0,0,asm,10001+20000,359.7653 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,12,8,0,0,asm,10001+20000,359.5027 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,12,8,0,0,asm,11009+21000,384.1683 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,12,8,0,0,asm,11010+21000,378.8381 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,12,8,0,0,asm,12003+22000,429.3415 +gfx938,f8_w8a8_block,torch.float16,56,2048,7168,12,8,0,0,asm,12003+22000,437.1896 +gfx938,f8_w8a8_block,torch.float16,64,2048,7168,12,8,0,0,asm,12003+22000,441.2582 +gfx938,f8_w8a8_block,torch.float16,68,2048,7168,12,8,0,0,asm,12003+22000,486.6129 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,12,8,0,0,asm,12000+22000,498.7031 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,12,8,0,0,asm,12003+22000,449.5136 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,12,8,0,0,asm,12003+22000,450.486 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,12,8,0,0,asm,12003+22000,480.373 +gfx938,f8_w8a8_block,torch.float16,96,2048,7168,12,8,0,0,asm,12003+22000,486.3176 +gfx938,f8_w8a8_block,torch.float16,100,2048,7168,12,8,0,0,asm,12003+22000,583.1007 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,12,8,0,0,asm,12003+22000,625.4102 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,12,8,0,0,asm,12003+22000,643.3043 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,12,8,0,0,asm,12003+22000,649.7136 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,12,8,0,0,asm,12003+22000,861.5113 +gfx938,f8_w8a8_block,torch.float16,256,2048,7168,12,8,0,0,asm,12003+22000,879.8491 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,12,8,0,0,asm,12003+22000,1276.9704 +gfx938,f8_w8a8_block,torch.float16,512,2048,7168,12,8,0,0,asm,13001+23000,1580.5419 +gfx938,f8_w8a8_block,torch.float16,640,2048,7168,12,8,0,0,asm,12003+22000,1933.2875 
+gfx938,f8_w8a8_block,torch.float16,768,2048,7168,12,8,0,0,asm,13001+23000,2321.9867 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,12,8,0,0,asm,13001+23000,2891.5873 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,12,8,0,0,asm,13001+23000,4088.2241 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,12,8,0,0,asm,13001+23000,5294.1901 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,12,8,0,0,asm,13001+23000,7754.2657 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,12,8,0,0,asm,13001+23000,10316.9177 +gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,12,8,0,0,asm,13001+23000,15092.8758 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,12,8,0,0,asm,13001+23000,17581.5223 +gfx938,f8_w8a8_block,torch.float16,7680,2048,7168,12,8,0,0,asm,13001+23000,18825.2402 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,12,8,0,0,asm,13001+23000,20077.4043 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,12,8,0,0,asm,13001+23000,29951.1569 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,12,8,0,0,asm,13001+23000,39756.4384 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,32,8,0,0,asm,10002+20000,260.7162 +gfx938,f8_w8a8_block,torch.float16,2,2048,7168,32,8,0,0,asm,10002+20000,409.1621 +gfx938,f8_w8a8_block,torch.float16,4,2048,7168,32,8,0,0,asm,10006+20000,556.5805 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,32,8,0,0,asm,10002+20000,717.0262 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,32,8,0,0,asm,10002+20000,718.2809 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,32,8,0,0,asm,10002+20000,788.2175 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,32,8,0,0,asm,10002+20000,836.5512 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,32,8,0,0,asm,10002+20000,829.7501 +gfx938,f8_w8a8_block,torch.float16,36,2048,7168,32,8,0,0,asm,10002+20000,844.2227 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,32,8,0,0,asm,10006+20000,875.3385 +gfx938,f8_w8a8_block,torch.float16,56,2048,7168,32,8,0,0,asm,11007+21000,890.21 
+gfx938,f8_w8a8_block,torch.float16,64,2048,7168,32,8,0,0,asm,11007+21000,891.0467 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,32,8,0,0,asm,11010+21000,906.4289 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,32,8,0,0,asm,11007+21000,891.9616 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,32,8,0,0,asm,11010+21000,897.7047 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,32,8,0,0,asm,11010+21000,895.4226 +gfx938,f8_w8a8_block,torch.float16,96,2048,7168,32,8,0,0,asm,11010+21000,900.5763 +gfx938,f8_w8a8_block,torch.float16,100,2048,7168,32,8,0,0,asm,11010+21000,905.6794 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,32,8,0,0,asm,11010+21000,912.9299 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,32,8,0,0,asm,12002+22000,971.4591 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,32,8,0,0,asm,12003+22000,968.2561 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,32,8,0,0,asm,12003+22000,1065.4767 +gfx938,f8_w8a8_block,torch.float16,256,2048,7168,32,8,0,0,asm,12003+22000,1180.9711 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,32,8,0,0,asm,13001+22000,1378.7978 +gfx938,f8_w8a8_block,torch.float16,512,2048,7168,32,8,0,0,asm,12005+22000,1771.5964 +gfx938,f8_w8a8_block,torch.float16,768,2048,7168,32,8,0,0,asm,12005+22000,2489.4297 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,32,8,0,0,asm,13001+23000,2843.6769 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,32,8,0,0,asm,13001+23000,4129.2134 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,32,8,0,0,asm,13001+23000,5286.1524 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,32,8,0,0,asm,13001+23000,7599.4159 +gfx938,f8_w8a8_block,torch.float16,3584,2048,7168,32,8,0,0,asm,13001+23000,8879.8913 +gfx938,f8_w8a8_block,torch.float16,3840,2048,7168,32,8,0,0,asm,13001+23000,9373.7589 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,32,8,0,0,asm,13001+23000,9922.2875 +gfx938,f8_w8a8_block,torch.float16,5120,2048,7168,32,8,0,0,asm,13001+23000,12321.6558 
+gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,32,8,0,0,asm,13001+23000,14637.0245 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,32,8,0,0,asm,13001+23000,17064.6537 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,32,8,0,0,asm,13001+23000,19372.8049 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,32,8,0,0,asm,13001+23000,28622.759 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,32,8,0,0,asm,13001+23000,37843.5938 +gfx938,f8_w8a8_block,torch.float16,24576,2048,7168,32,8,0,0,asm,13001+23000,56428.1749 +gfx938,f8_w8a8_block,torch.float16,32768,2048,7168,32,8,0,0,asm,13001+23000,75020.0322 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,16,8,0,0,asm,10002+20000,283.4024 +gfx938,f8_w8a8_block,torch.float16,2,2048,7168,16,8,0,0,asm,10002+20000,386.6779 +gfx938,f8_w8a8_block,torch.float16,4,2048,7168,16,8,0,0,asm,10002+20000,431.1746 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,16,8,0,0,asm,10002+20000,450.8883 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,16,8,0,0,asm,10002+20000,468.6651 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,16,8,0,0,asm,10006+20000,468.2018 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,16,8,0,0,asm,10002+20000,511.6092 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,16,8,0,0,asm,11010+21000,490.2902 +gfx938,f8_w8a8_block,torch.float16,36,2048,7168,16,8,0,0,asm,11010+21000,500.5356 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,16,8,0,0,asm,11010+21000,516.4766 +gfx938,f8_w8a8_block,torch.float16,56,2048,7168,16,8,0,0,asm,11010+21000,518.5145 +gfx938,f8_w8a8_block,torch.float16,64,2048,7168,16,8,0,0,asm,12003+22001,568.0836 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,16,8,0,0,asm,12003+22002,599.7184 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,16,8,0,0,asm,12003+22000,545.9923 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,16,8,0,0,asm,12003+22000,577.1248 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,16,8,0,0,asm,12003+22000,548.5186 
+gfx938,f8_w8a8_block,torch.float16,96,2048,7168,16,8,0,0,asm,12003+22000,582.6574 +gfx938,f8_w8a8_block,torch.float16,100,2048,7168,16,8,0,0,asm,12003+22000,554.8091 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,16,8,0,0,asm,12003+22000,588.729 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,16,8,0,0,asm,12003+22000,618.3487 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,16,8,0,0,asm,13001+22000,733.4864 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,16,8,0,0,asm,13001+22000,769.9915 +gfx938,f8_w8a8_block,torch.float16,256,2048,7168,16,8,0,0,asm,12005+22000,1012.8147 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,16,8,0,0,asm,12005+22000,1266.6075 +gfx938,f8_w8a8_block,torch.float16,512,2048,7168,16,8,0,0,asm,12005+22000,1651.9 +gfx938,f8_w8a8_block,torch.float16,768,2048,7168,16,8,0,0,asm,13001+22000,2209.8097 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,16,8,0,0,asm,13001+23000,2721.6816 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,16,8,0,0,asm,13001+23000,3930.9237 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,16,8,0,0,asm,13001+23000,5010.0442 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,16,8,0,0,asm,13001+23000,7328.1415 +gfx938,f8_w8a8_block,torch.float16,3584,2048,7168,16,8,0,0,asm,13001+23000,8457.5269 +gfx938,f8_w8a8_block,torch.float16,3840,2048,7168,16,8,0,0,asm,13001+23000,9092.6067 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,16,8,0,0,asm,13001+23000,9647.2743 +gfx938,f8_w8a8_block,torch.float16,5120,2048,7168,16,8,0,0,asm,13001+23000,11924.7904 +gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,16,8,0,0,asm,13001+23000,14222.8628 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,16,8,0,0,asm,13001+23000,16384.8763 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,16,8,0,0,asm,13001+23000,18712.2948 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,16,8,0,0,asm,13001+23000,27952.4493 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,16,8,0,0,asm,13001+23000,36869.1837 
+gfx938,f8_w8a8_block,torch.float16,24576,2048,7168,16,8,0,0,asm,13001+23000,54914.6881 +gfx938,f8_w8a8_block,torch.float16,32768,2048,7168,16,8,0,0,asm,13001+23000,72948.9094 +gfx938,f8_w8a8_block,torch.float16,1,2048,7168,8,8,0,0,asm,10001+20000,267.6383 +gfx938,f8_w8a8_block,torch.float16,2,2048,7168,8,8,0,0,asm,10006+20000,270.9898 +gfx938,f8_w8a8_block,torch.float16,4,2048,7168,8,8,0,0,asm,10002+20000,270.8719 +gfx938,f8_w8a8_block,torch.float16,6,2048,7168,8,8,0,0,asm,10002+20000,280.5899 +gfx938,f8_w8a8_block,torch.float16,8,2048,7168,8,8,0,0,asm,10002+20000,283.0992 +gfx938,f8_w8a8_block,torch.float16,16,2048,7168,8,8,0,0,asm,10002+20000,286.2066 +gfx938,f8_w8a8_block,torch.float16,24,2048,7168,8,8,0,0,asm,11010+21000,303.6942 +gfx938,f8_w8a8_block,torch.float16,32,2048,7168,8,8,0,0,asm,11010+21000,312.3455 +gfx938,f8_w8a8_block,torch.float16,36,2048,7168,8,8,0,0,asm,12003+22000,317.6479 +gfx938,f8_w8a8_block,torch.float16,48,2048,7168,8,8,0,0,asm,12003+22002,328.1404 +gfx938,f8_w8a8_block,torch.float16,56,2048,7168,8,8,0,0,asm,12003+22000,353.4456 +gfx938,f8_w8a8_block,torch.float16,64,2048,7168,8,8,0,0,asm,12003+22000,331.7981 +gfx938,f8_w8a8_block,torch.float16,72,2048,7168,8,8,0,0,asm,13001+22000,456.1231 +gfx938,f8_w8a8_block,torch.float16,80,2048,7168,8,8,0,0,asm,13001+22000,424.8726 +gfx938,f8_w8a8_block,torch.float16,88,2048,7168,8,8,0,0,asm,13001+22000,431.4494 +gfx938,f8_w8a8_block,torch.float16,90,2048,7168,8,8,0,0,asm,13001+22000,435.4494 +gfx938,f8_w8a8_block,torch.float16,96,2048,7168,8,8,0,0,asm,13001+22000,441.8662 +gfx938,f8_w8a8_block,torch.float16,100,2048,7168,8,8,0,0,asm,13001+22000,430.2873 +gfx938,f8_w8a8_block,torch.float16,112,2048,7168,8,8,0,0,asm,13001+22000,446.6915 +gfx938,f8_w8a8_block,torch.float16,128,2048,7168,8,8,0,0,asm,13001+22000,438.5177 +gfx938,f8_w8a8_block,torch.float16,160,2048,7168,8,8,0,0,asm,12005+22000,614.5056 +gfx938,f8_w8a8_block,torch.float16,224,2048,7168,8,8,0,0,asm,13001+22000,739.8105 
+gfx938,f8_w8a8_block,torch.float16,256,2048,7168,8,8,0,0,asm,13001+22000,744.9672 +gfx938,f8_w8a8_block,torch.float16,384,2048,7168,8,8,0,0,asm,13001+22000,1087.3912 +gfx938,f8_w8a8_block,torch.float16,512,2048,7168,8,8,0,0,asm,13001+22000,1391.6428 +gfx938,f8_w8a8_block,torch.float16,768,2048,7168,8,8,0,0,asm,13001+23000,1964.0364 +gfx938,f8_w8a8_block,torch.float16,1024,2048,7168,8,8,0,0,asm,13001+23000,2507.9386 +gfx938,f8_w8a8_block,torch.float16,1536,2048,7168,8,8,0,0,asm,13001+23000,3636.7677 +gfx938,f8_w8a8_block,torch.float16,2048,2048,7168,8,8,0,0,asm,13001+23000,4742.7426 +gfx938,f8_w8a8_block,torch.float16,3072,2048,7168,8,8,0,0,asm,13001+23000,6988.6465 +gfx938,f8_w8a8_block,torch.float16,3584,2048,7168,8,8,0,0,asm,13001+23000,8090.6961 +gfx938,f8_w8a8_block,torch.float16,3840,2048,7168,8,8,0,0,asm,13001+23000,8696.184 +gfx938,f8_w8a8_block,torch.float16,4096,2048,7168,8,8,0,0,asm,13001+23000,9192.0232 +gfx938,f8_w8a8_block,torch.float16,5120,2048,7168,8,8,0,0,asm,13001+23000,11496.301 +gfx938,f8_w8a8_block,torch.float16,6144,2048,7168,8,8,0,0,asm,13001+23000,13715.0042 +gfx938,f8_w8a8_block,torch.float16,7168,2048,7168,8,8,0,0,asm,13001+23000,16126.7491 +gfx938,f8_w8a8_block,torch.float16,8192,2048,7168,8,8,0,0,asm,13001+23000,18519.7223 +gfx938,f8_w8a8_block,torch.float16,12288,2048,7168,8,8,0,0,asm,13001+23000,27498.3048 +gfx938,f8_w8a8_block,torch.float16,16384,2048,7168,8,8,0,0,asm,13001+23000,36258.1284 +gfx938,f8_w8a8_block,torch.float16,24576,2048,7168,8,8,0,0,asm,13001+23000,53518.8203 +gfx938,f8_w8a8_block,torch.float16,32768,2048,7168,8,8,0,0,asm,13001+23000,71472.2139 +gfx938,f8_w8a8_block,torch.float16,1,256,7168,257,9,0,0,asm,10007+20000,73.9463 +gfx938,f8_w8a8_block,torch.float16,2,256,7168,257,9,0,0,asm,10002+20000,103.0494 +gfx938,f8_w8a8_block,torch.float16,4,256,7168,257,9,0,0,asm,10002+20000,157.7439 +gfx938,f8_w8a8_block,torch.float16,6,256,7168,257,9,0,0,asm,10002+20000,211.7985 
+gfx938,f8_w8a8_block,torch.float16,8,256,7168,257,9,0,0,asm,10006+20000,253.7856 +gfx938,f8_w8a8_block,torch.float16,12,256,7168,257,9,0,0,asm,10002+20000,331.7222 +gfx938,f8_w8a8_block,torch.float16,16,256,7168,257,9,0,0,asm,10002+20000,407.5114 +gfx938,f8_w8a8_block,torch.float16,20,256,7168,257,9,0,0,asm,10005+20000,465.2345 +gfx938,f8_w8a8_block,torch.float16,24,256,7168,257,9,0,0,asm,10002+20000,519.9458 +gfx938,f8_w8a8_block,torch.float16,28,256,7168,257,9,0,0,asm,10002+20000,565.4308 +gfx938,f8_w8a8_block,torch.float16,32,256,7168,257,9,0,0,asm,10006+20000,592.3696 +gfx938,f8_w8a8_block,torch.float16,36,256,7168,257,9,0,0,asm,10001+20000,620.1476 +gfx938,f8_w8a8_block,torch.float16,40,256,7168,257,9,0,0,asm,10002+20000,642.4296 +gfx938,f8_w8a8_block,torch.float16,44,256,7168,257,9,0,0,asm,10002+20000,651.9706 +gfx938,f8_w8a8_block,torch.float16,48,256,7168,257,9,0,0,asm,10002+20000,686.421 +gfx938,f8_w8a8_block,torch.float16,56,256,7168,257,9,0,0,asm,10002+20000,736.5008 +gfx938,f8_w8a8_block,torch.float16,64,256,7168,257,9,0,0,asm,10002+20000,745.9071 +gfx938,f8_w8a8_block,torch.float16,72,256,7168,257,9,0,0,asm,10002+20000,776.5091 +gfx938,f8_w8a8_block,torch.float16,80,256,7168,257,9,0,0,asm,10002+20000,793.5026 +gfx938,f8_w8a8_block,torch.float16,96,256,7168,257,9,0,0,asm,10002+20000,817.8563 +gfx938,f8_w8a8_block,torch.float16,104,256,7168,257,9,0,0,asm,10002+20000,822.9004 +gfx938,f8_w8a8_block,torch.float16,112,256,7168,257,9,0,0,asm,10002+20000,826.3278 +gfx938,f8_w8a8_block,torch.float16,128,256,7168,257,9,0,0,asm,10002+20000,835.1446 +gfx938,f8_w8a8_block,torch.float16,144,256,7168,257,9,0,0,asm,10002+20000,848.1887 +gfx938,f8_w8a8_block,torch.float16,160,256,7168,257,9,0,0,asm,10001+20000,853.1572 +gfx938,f8_w8a8_block,torch.float16,192,256,7168,257,9,0,0,asm,10001+20000,864.9213 +gfx938,f8_w8a8_block,torch.float16,224,256,7168,257,9,0,0,asm,10002+20000,872.4666 +gfx938,f8_w8a8_block,torch.float16,256,256,7168,257,9,0,0,asm,10002+20000,891.0096 
+gfx938,f8_w8a8_block,torch.float16,320,256,7168,257,9,0,0,asm,10005+20000,922.6053 +gfx938,f8_w8a8_block,torch.float16,384,256,7168,257,9,0,0,asm,10006+20000,963.8683 +gfx938,f8_w8a8_block,torch.float16,448,256,7168,257,9,0,0,asm,11010+21000,981.4514 +gfx938,f8_w8a8_block,torch.float16,512,256,7168,257,9,0,0,asm,11009+21000,1019.9692 +gfx938,f8_w8a8_block,torch.float16,768,256,7168,257,9,0,0,asm,11010+21000,1092.5163 +gfx938,f8_w8a8_block,torch.float16,896,256,7168,257,9,0,0,asm,11010+21000,1171.3539 +gfx938,f8_w8a8_block,torch.float16,960,256,7168,257,9,0,0,asm,11010+21000,1275.7745 +gfx938,f8_w8a8_block,torch.float16,1024,256,7168,257,9,0,0,asm,11010+21000,1363.8584 +gfx938,f8_w8a8_block,torch.float16,1280,256,7168,257,9,0,0,asm,12003+22000,1408.9024 +gfx938,f8_w8a8_block,torch.float16,1536,256,7168,257,9,0,0,asm,12003+22000,1494.5274 +gfx938,f8_w8a8_block,torch.float16,1920,256,7168,257,9,0,0,asm,12005+22000,1886.1974 +gfx938,f8_w8a8_block,torch.float16,2048,256,7168,257,9,0,0,asm,12006+22000,2027.5611 +gfx938,f8_w8a8_block,torch.float16,2304,256,7168,257,9,0,0,asm,13001+23000,2184.8658 +gfx938,f8_w8a8_block,torch.float16,2560,256,7168,257,9,0,0,asm,13001+23000,2272.7138 +gfx938,f8_w8a8_block,torch.float16,3072,256,7168,257,9,0,0,asm,13001+23000,2400.5871 +gfx938,f8_w8a8_block,torch.float16,3584,256,7168,257,9,0,0,asm,12005+22000,2762.3288 +gfx938,f8_w8a8_block,torch.float16,3840,256,7168,257,9,0,0,asm,12005+22000,3116.4495 +gfx938,f8_w8a8_block,torch.float16,4096,256,7168,257,9,0,0,asm,12005+22000,3296.3478 +gfx938,f8_w8a8_block,torch.float16,4608,256,7168,257,9,0,0,asm,12005+22000,3493.6523 +gfx938,f8_w8a8_block,torch.float16,5120,256,7168,257,9,0,0,asm,12005+22000,3704.4556 +gfx938,f8_w8a8_block,torch.float16,6144,256,7168,257,9,0,0,asm,13001+23000,4279.6198 +gfx938,f8_w8a8_block,torch.float16,7168,256,7168,257,9,0,0,asm,13001+23000,4927.3562 +gfx938,f8_w8a8_block,torch.float16,8192,256,7168,257,9,0,0,asm,12005+22000,5789.3569 
+gfx938,f8_w8a8_block,torch.float16,10240,256,7168,257,9,0,0,asm,13001+23000,6638.1284 +gfx938,f8_w8a8_block,torch.float16,12288,256,7168,257,9,0,0,asm,13001+23000,8305.4483 +gfx938,f8_w8a8_block,torch.float16,16384,256,7168,257,9,0,0,asm,13001+23000,10449.6253 +gfx938,f8_w8a8_block,torch.float16,24576,256,7168,257,9,0,0,asm,13001+23000,15201.8004 +gfx938,f8_w8a8_block,torch.float16,32768,256,7168,257,9,0,0,asm,13001+23000,20098.0089 +gfx938,f8_w8a8_block,torch.float16,1,256,6144,256,8,0,0,asm,10007+20000,67.6813 +gfx938,f8_w8a8_block,torch.float16,2,256,6144,256,8,0,0,asm,10007+20000,93.7608 +gfx938,f8_w8a8_block,torch.float16,4,256,6144,256,8,0,0,asm,10002+20000,138.3419 +gfx938,f8_w8a8_block,torch.float16,6,256,6144,256,8,0,0,asm,10002+20000,179.6723 +gfx938,f8_w8a8_block,torch.float16,8,256,6144,256,8,0,0,asm,10002+20000,217.9458 +gfx938,f8_w8a8_block,torch.float16,12,256,6144,256,8,0,0,asm,10002+20000,275.4782 +gfx938,f8_w8a8_block,torch.float16,16,256,6144,256,8,0,0,asm,10002+20000,321.9033 +gfx938,f8_w8a8_block,torch.float16,20,256,6144,256,8,0,0,asm,10002+20000,370.4559 +gfx938,f8_w8a8_block,torch.float16,24,256,6144,256,8,0,0,asm,10002+20000,413.7231 +gfx938,f8_w8a8_block,torch.float16,28,256,6144,256,8,0,0,asm,10002+20000,470.2396 +gfx938,f8_w8a8_block,torch.float16,32,256,6144,256,8,0,0,asm,10002+20000,494.3489 +gfx938,f8_w8a8_block,torch.float16,36,256,6144,256,8,0,0,asm,10002+20000,514.7164 +gfx938,f8_w8a8_block,torch.float16,40,256,6144,256,8,0,0,asm,10002+20000,535.8532 +gfx938,f8_w8a8_block,torch.float16,44,256,6144,256,8,0,0,asm,10002+20000,549.3352 +gfx938,f8_w8a8_block,torch.float16,48,256,6144,256,8,0,0,asm,10002+20000,566.0256 +gfx938,f8_w8a8_block,torch.float16,56,256,6144,256,8,0,0,asm,10001+20000,595.3224 +gfx938,f8_w8a8_block,torch.float16,64,256,6144,256,8,0,0,asm,10002+20000,617.7644 +gfx938,f8_w8a8_block,torch.float16,72,256,6144,256,8,0,0,asm,10002+20000,634.5306 
+gfx938,f8_w8a8_block,torch.float16,80,256,6144,256,8,0,0,asm,10002+20000,651.9285 +gfx938,f8_w8a8_block,torch.float16,96,256,6144,256,8,0,0,asm,10002+20000,682.6315 +gfx938,f8_w8a8_block,torch.float16,104,256,6144,256,8,0,0,asm,10002+20000,688.2567 +gfx938,f8_w8a8_block,torch.float16,112,256,6144,256,8,0,0,asm,10002+20000,698.4546 +gfx938,f8_w8a8_block,torch.float16,128,256,6144,256,8,0,0,asm,10002+20000,712.0882 +gfx938,f8_w8a8_block,torch.float16,144,256,6144,256,8,0,0,asm,10002+20000,718.6482 +gfx938,f8_w8a8_block,torch.float16,160,256,6144,256,8,0,0,asm,10002+20000,730.6565 +gfx938,f8_w8a8_block,torch.float16,192,256,6144,256,8,0,0,asm,10002+20000,739.8271 +gfx938,f8_w8a8_block,torch.float16,224,256,6144,256,8,0,0,asm,10002+20000,741.7386 +gfx938,f8_w8a8_block,torch.float16,256,256,6144,256,8,0,0,asm,10001+20000,754.8923 +gfx938,f8_w8a8_block,torch.float16,320,256,6144,256,8,0,0,asm,10002+20000,770.9764 +gfx938,f8_w8a8_block,torch.float16,384,256,6144,256,8,0,0,asm,10002+20000,800.1889 +gfx938,f8_w8a8_block,torch.float16,448,256,6144,256,8,0,0,asm,11007+21000,836.1298 +gfx938,f8_w8a8_block,torch.float16,512,256,6144,256,8,0,0,asm,11007+21000,857.3255 +gfx938,f8_w8a8_block,torch.float16,768,256,6144,256,8,0,0,asm,11007+21000,932.2726 +gfx938,f8_w8a8_block,torch.float16,896,256,6144,256,8,0,0,asm,11010+21000,956.6094 +gfx938,f8_w8a8_block,torch.float16,960,256,6144,256,8,0,0,asm,11010+21000,1000.4997 +gfx938,f8_w8a8_block,torch.float16,1024,256,6144,256,8,0,0,asm,11010+21000,1056.4489 +gfx938,f8_w8a8_block,torch.float16,1280,256,6144,256,8,0,0,asm,12003+22000,1205.4336 +gfx938,f8_w8a8_block,torch.float16,1536,256,6144,256,8,0,0,asm,12003+22000,1248.1451 +gfx938,f8_w8a8_block,torch.float16,1920,256,6144,256,8,0,0,asm,12003+22000,1397.3318 +gfx938,f8_w8a8_block,torch.float16,2048,256,6144,256,8,0,0,asm,12005+22000,1552.9186 +gfx938,f8_w8a8_block,torch.float16,2304,256,6144,256,8,0,0,asm,12005+22000,1788.5303 
+gfx938,f8_w8a8_block,torch.float16,2560,256,6144,256,8,0,0,asm,12005+22000,1912.6982 +gfx938,f8_w8a8_block,torch.float16,3072,256,6144,256,8,0,0,asm,13001+23000,2027.8051 +gfx938,f8_w8a8_block,torch.float16,3584,256,6144,256,8,0,0,asm,12005+22000,2141.1773 +gfx938,f8_w8a8_block,torch.float16,3840,256,6144,256,8,0,0,asm,12005+22000,2282.5494 +gfx938,f8_w8a8_block,torch.float16,4096,256,6144,256,8,0,0,asm,12005+22000,2508.3084 +gfx938,f8_w8a8_block,torch.float16,4608,256,6144,256,8,0,0,asm,12005+22000,2875.9616 +gfx938,f8_w8a8_block,torch.float16,5120,256,6144,256,8,0,0,asm,12005+22000,3084.045 +gfx938,f8_w8a8_block,torch.float16,6144,256,6144,256,8,0,0,asm,12005+22000,3509.3738 +gfx938,f8_w8a8_block,torch.float16,7168,256,6144,256,8,0,0,asm,13001+23000,3791.5959 +gfx938,f8_w8a8_block,torch.float16,8192,256,6144,256,8,0,0,asm,13001+23000,4491.9889 +gfx938,f8_w8a8_block,torch.float16,10240,256,6144,256,8,0,0,asm,13001+23000,5509.2227 +gfx938,f8_w8a8_block,torch.float16,12288,256,6144,256,8,0,0,asm,13001+23000,6313.4469 +gfx938,f8_w8a8_block,torch.float16,16384,256,6144,256,8,0,0,asm,13001+23000,8165.8268 +gfx938,f8_w8a8_block,torch.float16,24576,256,6144,256,8,0,0,asm,13001+23000,11880.1362 +gfx938,f8_w8a8_block,torch.float16,32768,256,6144,256,8,0,0,asm,13001+23000,15682.8069 +gfx938,f8_w8a8_block,torch.float16,1,256,6144,257,9,0,0,asm,10007+20000,70.5191 +gfx938,f8_w8a8_block,torch.float16,2,256,6144,257,9,0,0,asm,10007+20000,98.4684 +gfx938,f8_w8a8_block,torch.float16,4,256,6144,257,9,0,0,asm,10002+20000,148.6071 +gfx938,f8_w8a8_block,torch.float16,6,256,6144,257,9,0,0,asm,10002+20000,191.8575 +gfx938,f8_w8a8_block,torch.float16,8,256,6144,257,9,0,0,asm,10002+20000,232.4299 +gfx938,f8_w8a8_block,torch.float16,12,256,6144,257,9,0,0,asm,10001+20000,301.6255 +gfx938,f8_w8a8_block,torch.float16,16,256,6144,257,9,0,0,asm,10002+20000,361.3053 +gfx938,f8_w8a8_block,torch.float16,20,256,6144,257,9,0,0,asm,10002+20000,407.3989 
+gfx938,f8_w8a8_block,torch.float16,24,256,6144,257,9,0,0,asm,10002+20000,454.4134 +gfx938,f8_w8a8_block,torch.float16,28,256,6144,257,9,0,0,asm,10002+20000,497.0436 +gfx938,f8_w8a8_block,torch.float16,32,256,6144,257,9,0,0,asm,10002+20000,519.1488 +gfx938,f8_w8a8_block,torch.float16,36,256,6144,257,9,0,0,asm,10002+20000,533.7563 +gfx938,f8_w8a8_block,torch.float16,40,256,6144,257,9,0,0,asm,10002+20000,555.2046 +gfx938,f8_w8a8_block,torch.float16,44,256,6144,257,9,0,0,asm,10002+20000,563.3562 +gfx938,f8_w8a8_block,torch.float16,48,256,6144,257,9,0,0,asm,10002+20000,592.3582 +gfx938,f8_w8a8_block,torch.float16,56,256,6144,257,9,0,0,asm,10002+20000,634.9769 +gfx938,f8_w8a8_block,torch.float16,64,256,6144,257,9,0,0,asm,10002+20000,648.5771 +gfx938,f8_w8a8_block,torch.float16,72,256,6144,257,9,0,0,asm,10002+20000,663.4401 +gfx938,f8_w8a8_block,torch.float16,80,256,6144,257,9,0,0,asm,10002+20000,680.9811 +gfx938,f8_w8a8_block,torch.float16,96,256,6144,257,9,0,0,asm,10001+20000,700.9809 +gfx938,f8_w8a8_block,torch.float16,104,256,6144,257,9,0,0,asm,10002+20000,713.1156 +gfx938,f8_w8a8_block,torch.float16,112,256,6144,257,9,0,0,asm,10002+20000,717.2166 +gfx938,f8_w8a8_block,torch.float16,128,256,6144,257,9,0,0,asm,10001+20000,723.4987 +gfx938,f8_w8a8_block,torch.float16,144,256,6144,257,9,0,0,asm,10002+20000,724.9219 +gfx938,f8_w8a8_block,torch.float16,160,256,6144,257,9,0,0,asm,10002+20000,737.0734 +gfx938,f8_w8a8_block,torch.float16,192,256,6144,257,9,0,0,asm,10001+20000,742.9428 +gfx938,f8_w8a8_block,torch.float16,224,256,6144,257,9,0,0,asm,10001+20000,754.3533 +gfx938,f8_w8a8_block,torch.float16,256,256,6144,257,9,0,0,asm,10002+20000,768.6522 +gfx938,f8_w8a8_block,torch.float16,320,256,6144,257,9,0,0,asm,10002+20000,791.4058 +gfx938,f8_w8a8_block,torch.float16,384,256,6144,257,9,0,0,asm,10006+20000,832.4583 +gfx938,f8_w8a8_block,torch.float16,448,256,6144,257,9,0,0,asm,11007+21000,848.6097 
+gfx938,f8_w8a8_block,torch.float16,512,256,6144,257,9,0,0,asm,11010+21000,878.3359 +gfx938,f8_w8a8_block,torch.float16,768,256,6144,257,9,0,0,asm,11010+21000,943.6241 +gfx938,f8_w8a8_block,torch.float16,896,256,6144,257,9,0,0,asm,11009+21000,1048.4237 +gfx938,f8_w8a8_block,torch.float16,960,256,6144,257,9,0,0,asm,11010+21000,1100.1456 +gfx938,f8_w8a8_block,torch.float16,1024,256,6144,257,9,0,0,asm,11010+21000,1152.9031 +gfx938,f8_w8a8_block,torch.float16,1280,256,6144,257,9,0,0,asm,12003+22000,1223.6062 +gfx938,f8_w8a8_block,torch.float16,1536,256,6144,257,9,0,0,asm,12002+22000,1305.5679 +gfx938,f8_w8a8_block,torch.float16,1920,256,6144,257,9,0,0,asm,12005+22000,1656.2444 +gfx938,f8_w8a8_block,torch.float16,2048,256,6144,257,9,0,0,asm,12005+22000,1770.1303 +gfx938,f8_w8a8_block,torch.float16,2304,256,6144,257,9,0,0,asm,13001+23000,1911.9824 +gfx938,f8_w8a8_block,torch.float16,2560,256,6144,257,9,0,0,asm,13001+23000,1980.1926 +gfx938,f8_w8a8_block,torch.float16,3072,256,6144,257,9,0,0,asm,13001+23000,2101.4216 +gfx938,f8_w8a8_block,torch.float16,3584,256,6144,257,9,0,0,asm,12005+22000,2446.9782 +gfx938,f8_w8a8_block,torch.float16,3840,256,6144,257,9,0,0,asm,12005+22000,2696.7961 +gfx938,f8_w8a8_block,torch.float16,4096,256,6144,257,9,0,0,asm,12005+22000,2876.2059 +gfx938,f8_w8a8_block,torch.float16,4608,256,6144,257,9,0,0,asm,12005+22000,3061.4851 +gfx938,f8_w8a8_block,torch.float16,5120,256,6144,257,9,0,0,asm,12005+22000,3218.2929 +gfx938,f8_w8a8_block,torch.float16,6144,256,6144,257,9,0,0,asm,13001+23000,3724.1098 +gfx938,f8_w8a8_block,torch.float16,7168,256,6144,257,9,0,0,asm,13001+23000,4285.598 +gfx938,f8_w8a8_block,torch.float16,8192,256,6144,257,9,0,0,asm,12005+22000,5012.2561 +gfx938,f8_w8a8_block,torch.float16,10240,256,6144,257,9,0,0,asm,13001+23000,5764.6995 +gfx938,f8_w8a8_block,torch.float16,12288,256,6144,257,9,0,0,asm,13001+23000,7196.6686 +gfx938,f8_w8a8_block,torch.float16,16384,256,6144,257,9,0,0,asm,13001+23000,9152.8711 
+gfx938,f8_w8a8_block,torch.float16,24576,256,6144,257,9,0,0,asm,13001+23000,13116.0298 +gfx938,f8_w8a8_block,torch.float16,32768,256,6144,257,9,0,0,asm,13001+23000,17301.5285 +gfx938,f8_w8a8_block,torch.float16,1,384,3072,256,8,0,0,asm,10001+20000,65.795 +gfx938,f8_w8a8_block,torch.float16,2,384,3072,256,8,0,0,asm,10006+20000,83.7907 +gfx938,f8_w8a8_block,torch.float16,4,384,3072,256,8,0,0,asm,10002+20000,125.6179 +gfx938,f8_w8a8_block,torch.float16,6,384,3072,256,8,0,0,asm,10006+20000,149.9967 +gfx938,f8_w8a8_block,torch.float16,8,384,3072,256,8,0,0,asm,10002+20000,176.7839 +gfx938,f8_w8a8_block,torch.float16,12,384,3072,256,8,0,0,asm,10002+20000,227.4028 +gfx938,f8_w8a8_block,torch.float16,16,384,3072,256,8,0,0,asm,10002+20000,264.9185 +gfx938,f8_w8a8_block,torch.float16,20,384,3072,256,8,0,0,asm,10002+20000,300.1154 +gfx938,f8_w8a8_block,torch.float16,24,384,3072,256,8,0,0,asm,10002+20000,335.7783 +gfx938,f8_w8a8_block,torch.float16,28,384,3072,256,8,0,0,asm,10002+20000,378.2906 +gfx938,f8_w8a8_block,torch.float16,32,384,3072,256,8,0,0,asm,10002+20000,399.1411 +gfx938,f8_w8a8_block,torch.float16,36,384,3072,256,8,0,0,asm,10002+20000,410.4643 +gfx938,f8_w8a8_block,torch.float16,40,384,3072,256,8,0,0,asm,10002+20000,426.2791 +gfx938,f8_w8a8_block,torch.float16,44,384,3072,256,8,0,0,asm,10002+20000,445.6137 +gfx938,f8_w8a8_block,torch.float16,48,384,3072,256,8,0,0,asm,10002+20000,451.6853 +gfx938,f8_w8a8_block,torch.float16,56,384,3072,256,8,0,0,asm,10001+20000,474.1946 +gfx938,f8_w8a8_block,torch.float16,64,384,3072,256,8,0,0,asm,10002+20000,492.3841 +gfx938,f8_w8a8_block,torch.float16,72,384,3072,256,8,0,0,asm,10002+20000,506.1860 +gfx938,f8_w8a8_block,torch.float16,80,384,3072,256,8,0,0,asm,10002+20000,516.9145 +gfx938,f8_w8a8_block,torch.float16,96,384,3072,256,8,0,0,asm,10002+20000,535.4744 +gfx938,f8_w8a8_block,torch.float16,104,384,3072,256,8,0,0,asm,10002+20000,542.3038 +gfx938,f8_w8a8_block,torch.float16,112,384,3072,256,8,0,0,asm,10002+20000,545.5796 
+gfx938,f8_w8a8_block,torch.float16,128,384,3072,256,8,0,0,asm,10001+20000,558.7754 +gfx938,f8_w8a8_block,torch.float16,144,384,3072,256,8,0,0,asm,10002+20000,564.8554 +gfx938,f8_w8a8_block,torch.float16,160,384,3072,256,8,0,0,asm,10002+20000,564.2491 +gfx938,f8_w8a8_block,torch.float16,192,384,3072,256,8,0,0,asm,10001+20000,573.4617 +gfx938,f8_w8a8_block,torch.float16,224,384,3072,256,8,0,0,asm,10002+20000,579.6174 +gfx938,f8_w8a8_block,torch.float16,256,384,3072,256,8,0,0,asm,10002+20000,583.6848 +gfx938,f8_w8a8_block,torch.float16,320,384,3072,256,8,0,0,asm,10002+20000,595.5837 +gfx938,f8_w8a8_block,torch.float16,384,384,3072,256,8,0,0,asm,10006+20000,619.592 +gfx938,f8_w8a8_block,torch.float16,448,384,3072,256,8,0,0,asm,11010+21000,628.4425 +gfx938,f8_w8a8_block,torch.float16,512,384,3072,256,8,0,0,asm,11010+21000,641.6214 +gfx938,f8_w8a8_block,torch.float16,768,384,3072,256,8,0,0,asm,11010+21000,721.0822 +gfx938,f8_w8a8_block,torch.float16,896,384,3072,256,8,0,0,asm,11010+21000,708.9475 +gfx938,f8_w8a8_block,torch.float16,960,384,3072,256,8,0,0,asm,11010+21000,746.0674 +gfx938,f8_w8a8_block,torch.float16,1024,384,3072,256,8,0,0,asm,11010+21000,774.5641 +gfx938,f8_w8a8_block,torch.float16,1280,384,3072,256,8,0,0,asm,12003+22000,845.0818 +gfx938,f8_w8a8_block,torch.float16,1536,384,3072,256,8,0,0,asm,12003+22000,916.7026 +gfx938,f8_w8a8_block,torch.float16,1920,384,3072,256,8,0,0,asm,12003+22000,998.3023 +gfx938,f8_w8a8_block,torch.float16,2048,384,3072,256,8,0,0,asm,12005+22000,1088.9209 +gfx938,f8_w8a8_block,torch.float16,2304,384,3072,256,8,0,0,asm,12005+22000,1274.0655 +gfx938,f8_w8a8_block,torch.float16,2560,384,3072,256,8,0,0,asm,13001+22000,1351.3958 +gfx938,f8_w8a8_block,torch.float16,3072,384,3072,256,8,0,0,asm,13001+22000,1402.7135 +gfx938,f8_w8a8_block,torch.float16,3584,384,3072,256,8,0,0,asm,12005+22000,1488.7932 +gfx938,f8_w8a8_block,torch.float16,3840,384,3072,256,8,0,0,asm,12005+22000,1584.9023 
+gfx938,f8_w8a8_block,torch.float16,4096,384,3072,256,8,0,0,asm,12005+22000,1764.2532 +gfx938,f8_w8a8_block,torch.float16,4608,384,3072,256,8,0,0,asm,12005+22000,1992.3282 +gfx938,f8_w8a8_block,torch.float16,5120,384,3072,256,8,0,0,asm,12005+22000,2099.8142 +gfx938,f8_w8a8_block,torch.float16,6144,384,3072,256,8,0,0,asm,12005+22000,2443.4087 +gfx938,f8_w8a8_block,torch.float16,7168,384,3072,256,8,0,0,asm,13001+23000,2574.2547 +gfx938,f8_w8a8_block,torch.float16,8192,384,3072,256,8,0,0,asm,13001+23000,3029.9331 +gfx938,f8_w8a8_block,torch.float16,10240,384,3072,256,8,0,0,asm,13001+23000,3706.9581 +gfx938,f8_w8a8_block,torch.float16,12288,384,3072,256,8,0,0,asm,13001+23000,4301.5159 +gfx938,f8_w8a8_block,torch.float16,16384,384,3072,256,8,0,0,asm,13001+23000,5563.0188 +gfx938,f8_w8a8_block,torch.float16,24576,384,3072,256,8,0,0,asm,13001+23000,7960.9725 +gfx938,f8_w8a8_block,torch.float16,32768,384,3072,256,8,0,0,asm,13001+23000,10497.4939 +gfx938,f8_w8a8_block,torch.float16,1,1536,3072,64,8,0,0,asm,10002+20000,118.2831 +gfx938,f8_w8a8_block,torch.float16,2,1536,3072,64,8,0,0,asm,10002+20000,172.9438 +gfx938,f8_w8a8_block,torch.float16,4,1536,3072,64,8,0,0,asm,10006+20000,280.0004 +gfx938,f8_w8a8_block,torch.float16,6,1536,3072,64,8,0,0,asm,10002+20000,357.1707 +gfx938,f8_w8a8_block,torch.float16,8,1536,3072,64,8,0,0,asm,10001+20000,381.2043 +gfx938,f8_w8a8_block,torch.float16,12,1536,3072,64,8,0,0,asm,10002+20000,445.6251 +gfx938,f8_w8a8_block,torch.float16,16,1536,3072,64,8,0,0,asm,10002+20000,485.1029 +gfx938,f8_w8a8_block,torch.float16,20,1536,3072,64,8,0,0,asm,10002+20000,520.2745 +gfx938,f8_w8a8_block,torch.float16,24,1536,3072,64,8,0,0,asm,10002+20000,528.2071 +gfx938,f8_w8a8_block,torch.float16,28,1536,3072,64,8,0,0,asm,10002+20000,544.5385 +gfx938,f8_w8a8_block,torch.float16,32,1536,3072,64,8,0,0,asm,10002+20000,549.3889 +gfx938,f8_w8a8_block,torch.float16,36,1536,3072,64,8,0,0,asm,10002+20000,557.2091 
+gfx938,f8_w8a8_block,torch.float16,40,1536,3072,64,8,0,0,asm,10002+20000,559.5248 +gfx938,f8_w8a8_block,torch.float16,44,1536,3072,64,8,0,0,asm,10002+20000,568.5774 +gfx938,f8_w8a8_block,torch.float16,48,1536,3072,64,8,0,0,asm,10002+20000,576.7458 +gfx938,f8_w8a8_block,torch.float16,56,1536,3072,64,8,0,0,asm,10002+20000,580.5437 +gfx938,f8_w8a8_block,torch.float16,64,1536,3072,64,8,0,0,asm,10002+20000,581.052 +gfx938,f8_w8a8_block,torch.float16,72,1536,3072,64,8,0,0,asm,10002+20000,576.6869 +gfx938,f8_w8a8_block,torch.float16,80,1536,3072,64,8,0,0,asm,10001+20000,583.9037 +gfx938,f8_w8a8_block,torch.float16,96,1536,3072,64,8,0,0,asm,11008+21000,643.0025 +gfx938,f8_w8a8_block,torch.float16,104,1536,3072,64,8,0,0,asm,11010+21000,603.3142 +gfx938,f8_w8a8_block,torch.float16,112,1536,3072,64,8,0,0,asm,11010+21000,604.4005 +gfx938,f8_w8a8_block,torch.float16,128,1536,3072,64,8,0,0,asm,11010+21000,621.9529 +gfx938,f8_w8a8_block,torch.float16,144,1536,3072,64,8,0,0,asm,11010+21000,617.4194 +gfx938,f8_w8a8_block,torch.float16,160,1536,3072,64,8,0,0,asm,11010+21000,619.9457 +gfx938,f8_w8a8_block,torch.float16,192,1536,3072,64,8,0,0,asm,11009+21000,677.4613 +gfx938,f8_w8a8_block,torch.float16,224,1536,3072,64,8,0,0,asm,11010+21000,660.3666 +gfx938,f8_w8a8_block,torch.float16,256,1536,3072,64,8,0,0,asm,12003+22000,682.2528 +gfx938,f8_w8a8_block,torch.float16,320,1536,3072,64,8,0,0,asm,12003+22000,685.7728 +gfx938,f8_w8a8_block,torch.float16,384,1536,3072,64,8,0,0,asm,12002+22000,710.6569 +gfx938,f8_w8a8_block,torch.float16,448,1536,3072,64,8,0,0,asm,12003+22000,729.0737 +gfx938,f8_w8a8_block,torch.float16,512,1536,3072,64,8,0,0,asm,12003+22000,880.7616 +gfx938,f8_w8a8_block,torch.float16,768,1536,3072,64,8,0,0,asm,13001+22000,996.9717 +gfx938,f8_w8a8_block,torch.float16,896,1536,3072,64,8,0,0,asm,13001+22000,1021.9148 +gfx938,f8_w8a8_block,torch.float16,960,1536,3072,64,8,0,0,asm,13001+22000,1146.7902 
+gfx938,f8_w8a8_block,torch.float16,1024,1536,3072,64,8,0,0,asm,12005+22000,1338.7137 +gfx938,f8_w8a8_block,torch.float16,1280,1536,3072,64,8,0,0,asm,12005+22000,1541.1298 +gfx938,f8_w8a8_block,torch.float16,1536,1536,3072,64,8,0,0,asm,12005+22000,1845.1709 +gfx938,f8_w8a8_block,torch.float16,1920,1536,3072,64,8,0,0,asm,13001+22000,1981.4062 +gfx938,f8_w8a8_block,torch.float16,2048,1536,3072,64,8,0,0,asm,13001+23000,2225.9443 +gfx938,f8_w8a8_block,torch.float16,2304,1536,3072,64,8,0,0,asm,12005+22000,2547.0715 +gfx938,f8_w8a8_block,torch.float16,2560,1536,3072,64,8,0,0,asm,13001+23000,2643.6354 +gfx938,f8_w8a8_block,torch.float16,3072,1536,3072,64,8,0,0,asm,13001+23000,3097.7981 +gfx938,f8_w8a8_block,torch.float16,3584,1536,3072,64,8,0,0,asm,13001+23000,3522.9755 +gfx938,f8_w8a8_block,torch.float16,3840,1536,3072,64,8,0,0,asm,13001+23000,3585.7374 +gfx938,f8_w8a8_block,torch.float16,4096,1536,3072,64,8,0,0,asm,13001+23000,3955.6814 +gfx938,f8_w8a8_block,torch.float16,4608,1536,3072,64,8,0,0,asm,13001+23000,4386.8714 +gfx938,f8_w8a8_block,torch.float16,5120,1536,3072,64,8,0,0,asm,13001+23000,4794.1458 +gfx938,f8_w8a8_block,torch.float16,6144,1536,3072,64,8,0,0,asm,13001+23000,5649.5617 +gfx938,f8_w8a8_block,torch.float16,7168,1536,3072,64,8,0,0,asm,13001+23000,6510.7629 +gfx938,f8_w8a8_block,torch.float16,8192,1536,3072,64,8,0,0,asm,13001+23000,7415.2649 +gfx938,f8_w8a8_block,torch.float16,10240,1536,3072,64,8,0,0,asm,13001+23000,9181.1704 +gfx938,f8_w8a8_block,torch.float16,12288,1536,3072,64,8,0,0,asm,13001+23000,10876.1201 +gfx938,f8_w8a8_block,torch.float16,16384,1536,3072,64,8,0,0,asm,13001+23000,14314.7352 +gfx938,f8_w8a8_block,torch.float16,24576,1536,3072,64,8,0,0,asm,13001+23000,21336.6809 +gfx938,f8_w8a8_block,torch.float16,32768,1536,3072,64,8,0,0,asm,13001+23000,28266.1463 diff --git a/aiter/configs/tuned_fmoe_ck.csv b/aiter/configs/tuned_fmoe_ck.csv new file mode 100644 index 
0000000000000000000000000000000000000000..ab3191223d1ffe2531ab28256c5278d7c2b19c59 --- /dev/null +++ b/aiter/configs/tuned_fmoe_ck.csv @@ -0,0 +1,31 @@ +quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k,sol_type,sol_id,time_us +no_quant,torch.float16,1,256,8192,256,4,0,0,ck,576,309.8021428571405 +no_quant,torch.float16,8,256,8192,256,4,0,0,ck,576,364.40785714285965 +no_quant,torch.float16,16,256,8192,256,4,0,0,ck,576,705.001571428565 +no_quant,torch.float16,24,256,8192,256,4,0,0,ck,272,1028.2664285714334 +no_quant,torch.float16,32,256,8192,256,4,0,0,ck,576,1293.2290000000055 +no_quant,torch.float16,48,256,8192,256,4,0,0,ck,272,1877.2707142857207 +no_quant,torch.float16,64,256,8192,256,4,0,0,ck,576,2193.247142857142 +no_quant,torch.float16,96,256,8192,256,4,0,0,ck,272,2471.966714285715 +no_quant,torch.float16,128,256,8192,256,4,0,0,ck,576,2724.7889999999948 +no_quant,torch.float16,256,256,8192,256,4,0,0,ck,576,3391.9192857142875 +no_quant,torch.float16,512,256,8192,256,4,0,0,ck,576,3537.358999999999 +no_quant,torch.float16,1024,256,8192,256,4,0,0,ck,272,5022.04200000001 +no_quant,torch.float16,2048,256,8192,256,4,0,0,ck,272,6217.148285714277 +no_quant,torch.float16,4096,256,8192,256,4,0,0,ck,272,8764.503571428571 +no_quant,torch.float16,8192,256,8192,256,4,0,0,ck,272,16520.489142857128 +no_quant,torch.float16,1,256,7168,256,8,0,0,ck,576,274.4422857142844 +no_quant,torch.float16,8,256,7168,256,8,0,0,ck,576,584.9560000000014 +no_quant,torch.float16,16,256,7168,256,8,0,0,ck,576,1056.360857142859 +no_quant,torch.float16,24,256,7168,256,8,0,0,ck,576,1480.196999999999 +no_quant,torch.float16,32,256,7168,256,8,0,0,ck,576,1867.970714285712 +no_quant,torch.float16,48,256,7168,256,8,0,0,ck,576,2312.378428571425 +no_quant,torch.float16,64,256,7168,256,8,0,0,ck,576,2382.001142857145 +no_quant,torch.float16,96,256,7168,256,8,0,0,ck,576,2795.691714285705 +no_quant,torch.float16,128,256,7168,256,8,0,0,ck,576,2820.2631428571426 
+no_quant,torch.float16,256,256,7168,256,8,0,0,ck,576,3055.348428571423 +no_quant,torch.float16,512,256,7168,256,8,0,0,ck,272,4095.392285714287 +no_quant,torch.float16,1024,256,7168,256,8,0,0,ck,272,5514.589714285707 +no_quant,torch.float16,2048,256,7168,256,8,0,0,ck,272,7718.288428571438 +no_quant,torch.float16,4096,256,7168,256,8,0,0,ck,272,14435.327142857148 +no_quant,torch.float16,8192,256,7168,256,8,0,0,ck,272,27850.411142857163 diff --git a/aiter/configs/tuned_gemm.csv b/aiter/configs/tuned_gemm.csv new file mode 100644 index 0000000000000000000000000000000000000000..f05032bc7d28756ddcb9f96a6ef8b54f30ad98c7 --- /dev/null +++ b/aiter/configs/tuned_gemm.csv @@ -0,0 +1 @@ +M,N,K,bias,dtype,outdtype,scaleAB,libtype,solidx,soltimes,kernelName diff --git a/aiter/configs/untuned_fmoe.csv b/aiter/configs/untuned_fmoe.csv new file mode 100644 index 0000000000000000000000000000000000000000..b9245af1528fe893d4f54a0a6ba1c5eb77269dea --- /dev/null +++ b/aiter/configs/untuned_fmoe.csv @@ -0,0 +1,12 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +256,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_128x128,1,0 +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Tensor,1,0 +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Token,1,0 +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.int4,QuantType.per_Tensor,1,0 +512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Tensor,1,0 +512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Token,1,0 +512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 
+512,6144,4096,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.int4,QuantType.per_Tensor,1,0 +4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1 \ No newline at end of file diff --git a/aiter/configs/untuned_fmoe_asm_w4a16.csv b/aiter/configs/untuned_fmoe_asm_w4a16.csv new file mode 100644 index 0000000000000000000000000000000000000000..b1675960e268d3a813faa6fbea1c9aa793fe46d8 --- /dev/null +++ b/aiter/configs/untuned_fmoe_asm_w4a16.csv @@ -0,0 +1,71 @@ +quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k +int4_w4a16,torch.float16,1,256,7168,256,8,0,0 +int4_w4a16,torch.float16,2,256,7168,256,8,0,0 +int4_w4a16,torch.float16,4,256,7168,256,8,0,0 +int4_w4a16,torch.float16,6,256,7168,256,8,0,0 +int4_w4a16,torch.float16,8,256,7168,256,8,0,0 +int4_w4a16,torch.float16,10,256,7168,256,8,0,0 +int4_w4a16,torch.float16,12,256,7168,256,8,0,0 +int4_w4a16,torch.float16,14,256,7168,256,8,0,0 +int4_w4a16,torch.float16,16,256,7168,256,8,0,0 +int4_w4a16,torch.float16,20,256,7168,256,8,0,0 +int4_w4a16,torch.float16,24,256,7168,256,8,0,0 +int4_w4a16,torch.float16,28,256,7168,256,8,0,0 +int4_w4a16,torch.float16,32,256,7168,256,8,0,0 +int4_w4a16,torch.float16,36,256,7168,256,8,0,0 +int4_w4a16,torch.float16,40,256,7168,256,8,0,0 +int4_w4a16,torch.float16,44,256,7168,256,8,0,0 +int4_w4a16,torch.float16,48,256,7168,256,8,0,0 +int4_w4a16,torch.float16,56,256,7168,256,8,0,0 +int4_w4a16,torch.float16,64,256,7168,256,8,0,0 +int4_w4a16,torch.float16,80,256,7168,256,8,0,0 +int4_w4a16,torch.float16,96,256,7168,256,8,0,0 +int4_w4a16,torch.float16,112,256,7168,256,8,0,0 +int4_w4a16,torch.float16,128,256,7168,256,8,0,0 +int4_w4a16,torch.float16,160,256,7168,256,8,0,0 +int4_w4a16,torch.float16,192,256,7168,256,8,0,0 +int4_w4a16,torch.float16,224,256,7168,256,8,0,0 
+int4_w4a16,torch.float16,256,256,7168,256,8,0,0 +int4_w4a16,torch.float16,320,256,7168,256,8,0,0 +int4_w4a16,torch.float16,384,256,7168,256,8,0,0 +int4_w4a16,torch.float16,448,256,7168,256,8,0,0 +int4_w4a16,torch.float16,512,256,7168,256,8,0,0 +int4_w4a16,torch.float16,576,256,7168,256,8,0,0 +int4_w4a16,torch.float16,640,256,7168,256,8,0,0 +int4_w4a16,torch.float16,704,256,7168,256,8,0,0 +int4_w4a16,torch.float16,768,256,7168,256,8,0,0 +int4_w4a16,torch.float16,832,256,7168,256,8,0,0 +int4_w4a16,torch.float16,896,256,7168,256,8,0,0 +int4_w4a16,torch.float16,960,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1024,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1152,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1280,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1408,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1536,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1664,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1792,256,7168,256,8,0,0 +int4_w4a16,torch.float16,1920,256,7168,256,8,0,0 +int4_w4a16,torch.float16,2048,256,7168,256,8,0,0 +int4_w4a16,torch.float16,2304,256,7168,256,8,0,0 +int4_w4a16,torch.float16,2560,256,7168,256,8,0,0 +int4_w4a16,torch.float16,2816,256,7168,256,8,0,0 +int4_w4a16,torch.float16,3072,256,7168,256,8,0,0 +int4_w4a16,torch.float16,3328,256,7168,256,8,0,0 +int4_w4a16,torch.float16,3584,256,7168,256,8,0,0 +int4_w4a16,torch.float16,3840,256,7168,256,8,0,0 +int4_w4a16,torch.float16,4096,256,7168,256,8,0,0 +int4_w4a16,torch.float16,4608,256,7168,256,8,0,0 +int4_w4a16,torch.float16,5120,256,7168,256,8,0,0 +int4_w4a16,torch.float16,5632,256,7168,256,8,0,0 +int4_w4a16,torch.float16,6144,256,7168,256,8,0,0 +int4_w4a16,torch.float16,6656,256,7168,256,8,0,0 +int4_w4a16,torch.float16,7168,256,7168,256,8,0,0 +int4_w4a16,torch.float16,7680,256,7168,256,8,0,0 +int4_w4a16,torch.float16,8192,256,7168,256,8,0,0 +int4_w4a16,torch.float16,10240,256,7168,256,8,0,0 +int4_w4a16,torch.float16,12288,256,7168,256,8,0,0 +int4_w4a16,torch.float16,14336,256,7168,256,8,0,0 
+int4_w4a16,torch.float16,16384,256,7168,256,8,0,0 +int4_w4a16,torch.float16,17408,256,7168,256,8,0,0 +int4_w4a16,torch.float16,24576,256,7168,256,8,0,0 +int4_w4a16,torch.float16,32768,256,7168,256,8,0,0 \ No newline at end of file diff --git a/aiter/configs/untuned_fmoe_asm_w4a8.csv b/aiter/configs/untuned_fmoe_asm_w4a8.csv new file mode 100644 index 0000000000000000000000000000000000000000..45b53381f293af82501f7fb524d2df9cadf2cc35 --- /dev/null +++ b/aiter/configs/untuned_fmoe_asm_w4a8.csv @@ -0,0 +1,95 @@ +quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k +int4_w4a8,torch.float16,1,256,7168,256,8,0,0 +int4_w4a8,torch.float16,2,256,7168,256,8,0,0 +int4_w4a8,torch.float16,4,256,7168,256,8,0,0 +int4_w4a8,torch.float16,6,256,7168,256,8,0,0 +int4_w4a8,torch.float16,8,256,7168,256,8,0,0 +int4_w4a8,torch.float16,10,256,7168,256,8,0,0 +int4_w4a8,torch.float16,12,256,7168,256,8,0,0 +int4_w4a8,torch.float16,14,256,7168,256,8,0,0 +int4_w4a8,torch.float16,16,256,7168,256,8,0,0 +int4_w4a8,torch.float16,20,256,7168,256,8,0,0 +int4_w4a8,torch.float16,24,256,7168,256,8,0,0 +int4_w4a8,torch.float16,28,256,7168,256,8,0,0 +int4_w4a8,torch.float16,32,256,7168,256,8,0,0 +int4_w4a8,torch.float16,36,256,7168,256,8,0,0 +int4_w4a8,torch.float16,40,256,7168,256,8,0,0 +int4_w4a8,torch.float16,44,256,7168,256,8,0,0 +int4_w4a8,torch.float16,48,256,7168,256,8,0,0 +int4_w4a8,torch.float16,56,256,7168,256,8,0,0 +int4_w4a8,torch.float16,64,256,7168,256,8,0,0 +int4_w4a8,torch.float16,72,256,7168,256,8,0,0 +int4_w4a8,torch.float16,80,256,7168,256,8,0,0 +int4_w4a8,torch.float16,96,256,7168,256,8,0,0 +int4_w4a8,torch.float16,112,256,7168,256,8,0,0 +int4_w4a8,torch.float16,128,256,7168,256,8,0,0 +int4_w4a8,torch.float16,160,256,7168,256,8,0,0 +int4_w4a8,torch.float16,192,256,7168,256,8,0,0 +int4_w4a8,torch.float16,224,256,7168,256,8,0,0 +int4_w4a8,torch.float16,256,256,7168,256,8,0,0 +int4_w4a8,torch.float16,320,256,7168,256,8,0,0 
+int4_w4a8,torch.float16,384,256,7168,256,8,0,0 +int4_w4a8,torch.float16,448,256,7168,256,8,0,0 +int4_w4a8,torch.float16,512,256,7168,256,8,0,0 +int4_w4a8,torch.float16,576,256,7168,256,8,0,0 +int4_w4a8,torch.float16,640,256,7168,256,8,0,0 +int4_w4a8,torch.float16,704,256,7168,256,8,0,0 +int4_w4a8,torch.float16,768,256,7168,256,8,0,0 +int4_w4a8,torch.float16,832,256,7168,256,8,0,0 +int4_w4a8,torch.float16,896,256,7168,256,8,0,0 +int4_w4a8,torch.float16,960,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1024,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1152,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1280,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1408,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1536,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1664,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1792,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1920,256,7168,256,8,0,0 +int4_w4a8,torch.float16,2048,256,7168,256,8,0,0 +int4_w4a8,torch.float16,2304,256,7168,256,8,0,0 +int4_w4a8,torch.float16,2560,256,7168,256,8,0,0 +int4_w4a8,torch.float16,2816,256,7168,256,8,0,0 +int4_w4a8,torch.float16,3072,256,7168,256,8,0,0 +int4_w4a8,torch.float16,3328,256,7168,256,8,0,0 +int4_w4a8,torch.float16,3584,256,7168,256,8,0,0 +int4_w4a8,torch.float16,3840,256,7168,256,8,0,0 +int4_w4a8,torch.float16,4096,256,7168,256,8,0,0 +int4_w4a8,torch.float16,4608,256,7168,256,8,0,0 +int4_w4a8,torch.float16,5120,256,7168,256,8,0,0 +int4_w4a8,torch.float16,5632,256,7168,256,8,0,0 +int4_w4a8,torch.float16,6144,256,7168,256,8,0,0 +int4_w4a8,torch.float16,6656,256,7168,256,8,0,0 +int4_w4a8,torch.float16,7168,256,7168,256,8,0,0 +int4_w4a8,torch.float16,7680,256,7168,256,8,0,0 +int4_w4a8,torch.float16,8192,256,7168,256,8,0,0 +int4_w4a8,torch.float16,10240,256,7168,256,8,0,0 +int4_w4a8,torch.float16,12288,256,7168,256,8,0,0 +int4_w4a8,torch.float16,14336,256,7168,256,8,0,0 +int4_w4a8,torch.float16,16384,256,7168,256,8,0,0 +int4_w4a8,torch.float16,17408,256,7168,256,8,0,0 
+int4_w4a8,torch.float16,24576,256,7168,256,8,0,0 +int4_w4a8,torch.float16,32768,256,7168,256,8,0,0 +int4_w4a8,torch.float16,1,128,7168,256,8,0,0 +int4_w4a8,torch.float16,8,128,7168,256,8,0,0 +int4_w4a8,torch.float16,32,128,7168,256,8,0,0 +int4_w4a8,torch.float16,48,128,7168,256,8,0,0 +int4_w4a8,torch.float16,64,128,7168,256,8,0,0 +int4_w4a8,torch.float16,80,128,7168,256,8,0,0 +int4_w4a8,torch.float16,96,128,7168,256,8,0,0 +int4_w4a8,torch.float16,128,128,7168,256,8,0,0 +int4_w4a8,torch.float16,256,128,7168,256,8,0,0 +int4_w4a8,torch.float16,512,128,7168,256,8,0,0 +int4_w4a8,torch.float16,768,128,7168,256,8,0,0 +int4_w4a8,torch.float16,1024,128,7168,256,8,0,0 +int4_w4a8,torch.float16,2048,128,7168,256,8,0,0 +int4_w4a8,torch.float16,3072,128,7168,256,8,0,0 +int4_w4a8,torch.float16,4096,128,7168,256,8,0,0 +int4_w4a8,torch.float16,5120,128,7168,256,8,0,0 +int4_w4a8,torch.float16,6144,128,7168,256,8,0,0 +int4_w4a8,torch.float16,8192,128,7168,256,8,0,0 +int4_w4a8,torch.float16,10240,128,7168,256,8,0,0 +int4_w4a8,torch.float16,12288,128,7168,256,8,0,0 +int4_w4a8,torch.float16,16384,128,7168,256,8,0,0 +int4_w4a8,torch.float16,24576,128,7168,256,8,0,0 +int4_w4a8,torch.float16,32768,128,7168,256,8,0,0 \ No newline at end of file diff --git a/aiter/configs/untuned_fmoe_asm_w8a8.csv b/aiter/configs/untuned_fmoe_asm_w8a8.csv new file mode 100644 index 0000000000000000000000000000000000000000..c52c9eb05f527181d54571cc8121744f379e32e6 --- /dev/null +++ b/aiter/configs/untuned_fmoe_asm_w8a8.csv @@ -0,0 +1,179 @@ +quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k +f8_w8a8_channel,torch.float16,1,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,4,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,5,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,6,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,7,128,7168,256,8,0,0 
+f8_w8a8_channel,torch.float16,8,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,9,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,10,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,11,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,12,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,13,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,14,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,15,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,16,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,17,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,18,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,20,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,24,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,28,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,32,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,34,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,36,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,40,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,44,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,48,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,56,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,64,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,68,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,72,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,80,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,88,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,96,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,104,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,112,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,128,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,144,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,160,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,192,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,224,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,256,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,320,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,384,128,7168,256,8,0,0 
+f8_w8a8_channel,torch.float16,448,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,512,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,576,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,640,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,704,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,768,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,832,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,896,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,960,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1024,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1152,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1280,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1408,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1536,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1664,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1792,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1920,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2048,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2304,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2560,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2816,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3072,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3328,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3584,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3840,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,4096,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,4608,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,5120,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,5632,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,6144,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,6656,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,7168,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,7680,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,8192,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,10240,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,12288,128,7168,256,8,0,0 
+f8_w8a8_channel,torch.float16,14336,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,16384,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,17408,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,24576,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,32768,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,40960,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,49152,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,57344,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,65536,128,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,4,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,5,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,6,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,7,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,8,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,9,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,10,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,11,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,12,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,13,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,14,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,15,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,16,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,17,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,18,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,20,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,24,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,28,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,32,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,34,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,36,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,40,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,44,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,48,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,56,256,7168,256,8,0,0 
+f8_w8a8_channel,torch.float16,64,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,68,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,72,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,80,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,88,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,96,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,104,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,112,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,128,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,144,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,160,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,192,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,224,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,256,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,320,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,384,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,448,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,512,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,576,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,640,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,704,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,768,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,832,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,896,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,960,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1024,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1152,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1280,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1408,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1536,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1664,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1792,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,1920,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2048,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2304,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,2560,256,7168,256,8,0,0 
+f8_w8a8_channel,torch.float16,2816,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3072,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3328,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3584,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,3840,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,4096,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,4608,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,5120,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,5632,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,6144,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,6656,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,7168,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,7680,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,8192,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,10240,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,12288,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,14336,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,16384,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,17408,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,24576,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,32768,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,40960,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,49152,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,57344,256,7168,256,8,0,0 +f8_w8a8_channel,torch.float16,65536,256,7168,256,8,0,0 \ No newline at end of file diff --git a/aiter/configs/untuned_fmoe_asm_w8a8_group.csv b/aiter/configs/untuned_fmoe_asm_w8a8_group.csv new file mode 100644 index 0000000000000000000000000000000000000000..dac6e4e3bdcc1171ca3c5fce9b85c720661466a2 --- /dev/null +++ b/aiter/configs/untuned_fmoe_asm_w8a8_group.csv @@ -0,0 +1,179 @@ +quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k +f8_w8a8_block,torch.float16,1,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,4,128,7168,256,8,0,0 
+f8_w8a8_block,torch.float16,5,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,6,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,7,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,8,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,9,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,10,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,11,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,12,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,13,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,14,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,15,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,16,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,17,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,18,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,20,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,24,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,28,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,32,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,34,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,36,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,40,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,44,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,48,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,56,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,64,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,68,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,72,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,80,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,88,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,96,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,104,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,112,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,128,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,144,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,160,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,192,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,224,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,256,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,320,128,7168,256,8,0,0 
+f8_w8a8_block,torch.float16,384,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,448,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,512,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,576,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,640,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,704,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,768,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,832,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,896,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,960,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1024,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1152,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1280,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1408,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1536,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1664,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1792,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1920,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2048,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2304,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2560,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2816,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3072,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3328,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3584,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3840,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,4096,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,4608,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,5120,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,5632,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,6144,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,6656,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,7168,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,7680,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,8192,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,10240,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,12288,128,7168,256,8,0,0 
+f8_w8a8_block,torch.float16,14336,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,16384,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,17408,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,24576,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,32768,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,40960,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,49152,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,57344,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,65536,128,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,4,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,5,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,6,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,7,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,8,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,9,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,10,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,11,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,12,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,13,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,14,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,15,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,16,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,17,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,18,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,20,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,24,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,28,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,32,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,34,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,36,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,40,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,44,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,48,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,56,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,64,256,7168,256,8,0,0 
+f8_w8a8_block,torch.float16,68,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,72,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,80,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,88,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,96,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,104,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,112,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,128,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,144,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,160,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,192,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,224,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,256,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,320,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,384,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,448,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,512,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,576,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,640,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,704,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,768,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,832,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,896,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,960,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1024,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1152,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1280,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1408,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1536,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1664,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1792,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,1920,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2048,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2304,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2560,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,2816,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3072,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3328,256,7168,256,8,0,0 
+f8_w8a8_block,torch.float16,3584,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,3840,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,4096,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,4608,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,5120,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,5632,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,6144,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,6656,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,7168,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,7680,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,8192,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,10240,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,12288,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,14336,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,16384,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,17408,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,24576,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,32768,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,40960,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,49152,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,57344,256,7168,256,8,0,0 +f8_w8a8_block,torch.float16,65536,256,7168,256,8,0,0 \ No newline at end of file diff --git a/aiter/configs/untuned_fmoe_ck.csv b/aiter/configs/untuned_fmoe_ck.csv new file mode 100644 index 0000000000000000000000000000000000000000..c68acc6129672caea75dd8224866ae038160444e --- /dev/null +++ b/aiter/configs/untuned_fmoe_ck.csv @@ -0,0 +1,32 @@ +quant_type,indtype,token,inter_dim,model_dim,expert,topk,q_size_n,q_size_k +no_quant,torch.float16,1,256,8192,256,4,0,0 +no_quant,torch.float16,8,256,8192,256,4,0,0 +no_quant,torch.float16,16,256,8192,256,4,0,0 +no_quant,torch.float16,24,256,8192,256,4,0,0 +no_quant,torch.float16,32,256,8192,256,4,0,0 +no_quant,torch.float16,48,256,8192,256,4,0,0 +no_quant,torch.float16,64,256,8192,256,4,0,0 +no_quant,torch.float16,96,256,8192,256,4,0,0 +no_quant,torch.float16,128,256,8192,256,4,0,0 
+no_quant,torch.float16,256,256,8192,256,4,0,0 +no_quant,torch.float16,512,256,8192,256,4,0,0 +no_quant,torch.float16,1024,256,8192,256,4,0,0 +no_quant,torch.float16,2048,256,8192,256,4,0,0 +no_quant,torch.float16,4096,256,8192,256,4,0,0 +no_quant,torch.float16,8192,256,8192,256,4,0,0 + +no_quant,torch.float16,1,256,7168,256,8,0,0 +no_quant,torch.float16,8,256,7168,256,8,0,0 +no_quant,torch.float16,16,256,7168,256,8,0,0 +no_quant,torch.float16,24,256,7168,256,8,0,0 +no_quant,torch.float16,32,256,7168,256,8,0,0 +no_quant,torch.float16,48,256,7168,256,8,0,0 +no_quant,torch.float16,64,256,7168,256,8,0,0 +no_quant,torch.float16,96,256,7168,256,8,0,0 +no_quant,torch.float16,128,256,7168,256,8,0,0 +no_quant,torch.float16,256,256,7168,256,8,0,0 +no_quant,torch.float16,512,256,7168,256,8,0,0 +no_quant,torch.float16,1024,256,7168,256,8,0,0 +no_quant,torch.float16,2048,256,7168,256,8,0,0 +no_quant,torch.float16,4096,256,7168,256,8,0,0 +no_quant,torch.float16,8192,256,7168,256,8,0,0 diff --git a/aiter/configs/untuned_gemm.csv b/aiter/configs/untuned_gemm.csv new file mode 100644 index 0000000000000000000000000000000000000000..014539e55e1e34dac9740a367b3164fb4bcc2506 --- /dev/null +++ b/aiter/configs/untuned_gemm.csv @@ -0,0 +1 @@ +M,N,K,bias,dtype,outdtype,scaleAB diff --git a/aiter/dist/__init__.py b/aiter/dist/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/aiter/dist/communication_op.py b/aiter/dist/communication_op.py new file mode 100644 index 0000000000000000000000000000000000000000..63ea6c0aaf2255718ddc49285bbabe20afd541af --- /dev/null +++ b/aiter/dist/communication_op.py @@ -0,0 +1,61 @@ +""" +* Copyright (C) 2024-2025, The vLLM team. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +""" + +from typing import Any, Dict, Optional, Union + +import torch +import torch.distributed + +from .parallel_state import get_tp_group + + +def tensor_model_parallel_all_reduce( + input_: torch.Tensor, open_fp8_quant: bool = False +) -> torch.Tensor: + """All-reduce the input tensor across model parallel group.""" + return get_tp_group().all_reduce(input_, open_fp8_quant) + + +def tensor_model_parallel_fused_allreduce_rmsnorm( + input_: torch.Tensor, residual_inp_: torch.Tensor, weight_: torch.Tensor, eps: float +) -> tuple[torch.Tensor, torch.Tensor]: + return get_tp_group().fused_allreduce_rmsnorm(input_, residual_inp_, weight_, eps) + + +def tensor_model_parallel_custom_all_gather(input_: torch.Tensor) -> torch.Tensor: + return get_tp_group().custom_all_gather(input_) + + +def tensor_model_parallel_all_gather( + input_: torch.Tensor, use_custom: bool = False, dim: int = -1 +) -> torch.Tensor: + """All-gather the input tensor across model parallel group.""" + return get_tp_group().all_gather(input_, use_custom, dim) + + +def tensor_model_parallel_gather( + input_: torch.Tensor, dst: int = 0, dim: int = -1 +) -> Optional[torch.Tensor]: + """Gather the input tensor across model parallel group.""" + return get_tp_group().gather(input_, dst, dim) + + +def broadcast_tensor_dict( + tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0 +): + if not torch.distributed.is_initialized(): + return tensor_dict + return get_tp_group().broadcast_tensor_dict(tensor_dict, src) diff --git a/aiter/dist/cuda_wrapper.py 
b/aiter/dist/cuda_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..7d85778e40c548bef50b99ef364686b538477886 --- /dev/null +++ b/aiter/dist/cuda_wrapper.py @@ -0,0 +1,187 @@ +''' + * Copyright (c) 2024, The vLLM team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ''' + +"""This file is a pure Python wrapper for the cudart library. +It avoids the need to compile a separate shared library, and is +convenient for use when we just need to call a few functions. +""" + +import ctypes +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +# this line makes it possible to directly load `libcudart.so` using `ctypes` +import torch # noqa + +# from vllm.logger import init_logger + +# logger = init_logger(__name__) + +# === export types and functions from cudart to Python === +# for the original cudart definition, please check + +cudaError_t = ctypes.c_int +cudaMemcpyKind = ctypes.c_int + + +class cudaIpcMemHandle_t(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +@dataclass +class Function: + name: str + restype: Any + argtypes: List[Any] + + +def find_loaded_library(lib_name) -> Optional[str]: + """ + According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html, + the file `/proc/self/maps` contains the memory maps of the process, which includes the + shared libraries loaded by the process. We can use this file to find the path of the + a loaded library. 
+ """ # noqa + found = False + with open("/proc/self/maps") as f: + for line in f: + if lib_name in line: + found = True + break + if not found: + # the library is not loaded in the current process + return None + # if lib_name is libcudart, we need to match a line with: + # address /path/to/libcudart-hash.so.11.0 + start = line.index("/") + path = line[start:].strip() + filename = path.split("/")[-1] + assert filename.rpartition(".so")[0].startswith(lib_name), \ + f"Unexpected filename: {filename} for library {lib_name}" + return path + + +class CudaRTLibrary: + exported_functions = [ + # ​cudaError_t cudaSetDevice ( int device ) + Function("cudaSetDevice", cudaError_t, [ctypes.c_int]), + # cudaError_t cudaDeviceSynchronize ( void ) + Function("cudaDeviceSynchronize", cudaError_t, []), + # ​cudaError_t cudaDeviceReset ( void ) + Function("cudaDeviceReset", cudaError_t, []), + + # const char* cudaGetErrorString ( cudaError_t error ) + Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]), + + # ​cudaError_t cudaMalloc ( void** devPtr, size_t size ) + Function("cudaMalloc", cudaError_t, + [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]), + # ​cudaError_t cudaFree ( void* devPtr ) + Function("cudaFree", cudaError_t, [ctypes.c_void_p]), + # ​cudaError_t cudaMemset ( void* devPtr, int value, size_t count ) + Function("cudaMemset", cudaError_t, + [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]), + # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa + Function("cudaMemcpy", cudaError_t, [ + ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind + ]), + + # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa + Function("cudaIpcGetMemHandle", cudaError_t, + [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]), + # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa + Function("cudaIpcOpenMemHandle", cudaError_t, [ + 
ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint + ]), + ] + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: Dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the corresponding dictionary + path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} + + def __init__(self, so_file: Optional[str] = None): + if so_file is None: + so_file = find_loaded_library("libcudart") + assert so_file is not None, \ + "libcudart is not loaded in the current process" + if so_file not in CudaRTLibrary.path_to_library_cache: + lib = ctypes.CDLL(so_file) + CudaRTLibrary.path_to_library_cache[so_file] = lib + self.lib = CudaRTLibrary.path_to_library_cache[so_file] + + if so_file not in CudaRTLibrary.path_to_dict_mapping: + _funcs = {} + for func in CudaRTLibrary.exported_functions: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs + self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file] + + def CUDART_CHECK(self, result: cudaError_t) -> None: + if result != 0: + error_str = self.cudaGetErrorString(result) + raise RuntimeError(f"CUDART error: {error_str}") + + def cudaGetErrorString(self, error: cudaError_t) -> str: + return self.funcs["cudaGetErrorString"](error).decode("utf-8") + + def cudaSetDevice(self, device: int) -> None: + self.CUDART_CHECK(self.funcs["cudaSetDevice"](device)) + + def cudaDeviceSynchronize(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]()) + + def cudaDeviceReset(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceReset"]()) + + def cudaMalloc(self, size: int) -> ctypes.c_void_p: + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size)) + return devPtr + + def cudaFree(self, devPtr: ctypes.c_void_p) -> None: + 
self.CUDART_CHECK(self.funcs["cudaFree"](devPtr)) + + def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, + count: int) -> None: + self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count)) + + def cudaMemcpy(self, dst: ctypes.c_void_p, src: ctypes.c_void_p, + count: int) -> None: + cudaMemcpyDefault = 4 + kind = cudaMemcpyDefault + self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind)) + + def cudaIpcGetMemHandle(self, + devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t: + handle = cudaIpcMemHandle_t() + self.CUDART_CHECK(self.funcs["cudaIpcGetMemHandle"]( + ctypes.byref(handle), devPtr)) + return handle + + def cudaIpcOpenMemHandle(self, + handle: cudaIpcMemHandle_t) -> ctypes.c_void_p: + cudaIpcMemLazyEnablePeerAccess = 1 + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaIpcOpenMemHandle"]( + ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess)) + return devPtr diff --git a/aiter/dist/custom_all_reduce.py b/aiter/dist/custom_all_reduce.py new file mode 100644 index 0000000000000000000000000000000000000000..7dede82d55b8755c89af002bf6b03f0eac335b65 --- /dev/null +++ b/aiter/dist/custom_all_reduce.py @@ -0,0 +1,329 @@ +''' + * Copyright (c) 2024, The vLLM team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ''' + +from contextlib import contextmanager +from typing import Any, List, Optional, Union + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +# import vllm.envs as envs +# from vllm import _custom_ops as ops +import aiter as ops +import os +from .custom_all_reduce_utils import ( + gpu_p2p_access_check) +from .parallel_state import in_the_same_node_as +from aiter import logger + +try: + ops.meta_size() + custom_ar = True +except Exception: + # For CPUs + custom_ar = False + + +def _can_p2p(rank: int, world_size: int) -> bool: + for i in range(world_size): + if i == rank: + continue + if not gpu_p2p_access_check(rank, i): + return False + return True + + +def is_weak_contiguous(inp: torch.Tensor): + return inp.is_contiguous() or (inp.storage().nbytes() - + inp.storage_offset() * inp.element_size() + == inp.numel() * inp.element_size()) + + +class CustomAllreduce: + + _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8] + + # max_size: max supported allreduce size + def __init__(self, + group: ProcessGroup, + device: Union[int, str, torch.device], + max_size=8192 * 1024 * 8) -> None: + """ + Args: + group: the process group to work on. If None, it will use the + default process group. + device: the device to bind the CustomAllreduce to. If None, + it will be bind to f"cuda:{local_rank}". + It is the caller's responsibility to make sure each communicator + is bind to a unique device, and all communicators in this group + are in the same node. + """ + self._IS_CAPTURING = False + self.disabled = True + + if not custom_ar: + # disable because of missing custom allreduce library + # e.g. in a non-cuda environment + return + + self.group = group + + assert dist.get_backend(group) != dist.Backend.NCCL, ( + "CustomAllreduce should be attached to a non-NCCL group.") + + if not all(in_the_same_node_as(group, source_rank=0)): + # No need to initialize custom allreduce for multi-node case. 
+ logger.warning( + "Custom allreduce is disabled because this process group" + " spans across nodes.") + return + + rank = dist.get_rank(group=self.group) + world_size = dist.get_world_size(group=self.group) + if world_size == 1: + # No need to initialize custom allreduce for single GPU case. + return + + if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES: + logger.warning( + "Custom allreduce is disabled due to an unsupported world" + " size: %d. Supported world sizes: %s. To silence this " + "warning, specify disable_custom_all_reduce=True explicitly.", + world_size, str(CustomAllreduce._SUPPORTED_WORLD_SIZES)) + return + + if isinstance(device, int): + device = torch.device(f"cuda:{device}") + elif isinstance(device, str): + device = torch.device(device) + # now `device` is a `torch.device` object + assert isinstance(device, torch.device) + self.device = device + + cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "-1") + # cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES + if cuda_visible_devices: + device_ids = list(map(int, cuda_visible_devices.split(","))) + else: + from vllm.utils import cuda_device_count_stateless + device_ids = list(range(cuda_device_count_stateless())) + + physical_device_id = device_ids[device.index] + tensor = torch.tensor([physical_device_id], + dtype=torch.int, + device="cpu") + gather_list = [ + torch.tensor([0], dtype=torch.int, device="cpu") + for _ in range(world_size) + ] + dist.all_gather(gather_list, tensor, group=self.group) + physical_device_ids = [t.item() for t in gather_list] + + # test nvlink first, this will filter out most of the cases + # where custom allreduce is not supported + # this checks hardware and driver support for NVLink + # assert current_platform.is_cuda() or current_platform.is_rocm() + # full_nvlink = current_platform.is_full_nvlink(physical_device_ids) + full_nvlink = True + if world_size > 2 and not full_nvlink: + logger.warning( + "Custom allreduce is disabled because it's not 
supported on" + " more than two PCIe-only GPUs. To silence this warning, " + "specify disable_custom_all_reduce=True explicitly.") + return + # test P2P capability, this checks software/cudaruntime support + # this is expensive to compute at the first time + # then we cache the result + # On hygon GPU, p2p is always enabled between XGMI connected GPUs + # if not current_platform.is_rocm() and not _can_p2p(rank, world_size): + # logger.warning( + # "Custom allreduce is disabled because your platform lacks " + # "GPU P2P capability or P2P test failed. To silence this " + # "warning, specify disable_custom_all_reduce=True explicitly.") + # return + + self.disabled = False + # buffers memory are owned by this Python class and passed to C++ + # meta data composes of two parts: meta data for synchronization + # (256 bytes) and a temporary buffer for storing intermediate + # allreduce results. + # if current_platform.is_rocm(): + if 1: + self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size) + else: + self.meta = torch.zeros(ops.meta_size() + max_size, + dtype=torch.uint8, + device=self.device) + # This is a pre-registered IPC buffer. In eager mode, input tensors + # are first copied into this buffer before allreduce is performed + self.buffer = torch.empty(max_size, + dtype=torch.uint8, + device=self.device) + # This is a buffer for storing the tuples of pointers pointing to + # IPC buffers from all ranks. Each registered tuple has size of + # 8*world_size bytes where world_size is at most 8. Allocating 8MB + # is enough for 131072 such tuples. The largest model I've seen only + # needs less than 10000 of registered tuples. 
+ self.rank_data = torch.empty(8 * 1024 * 1024, + dtype=torch.uint8, + device=self.device) + self.max_size = max_size + self.rank = rank + self.world_size = world_size + # if current_platform.is_rocm(): + if 1: + # _share_cuda_() doesn't accept meta buffer not allocated from + # PyTorch cache allocator, use direct HIP call to get IPC handle + handle = ops.get_meta_buffer_ipc_handle(self.meta) + shard_data = ( + bytes(handle), # ipc handle to base ptr + 0, # offset of base ptr + ) + handles, offsets = self._gather_ipc_meta(shard_data) + else: + handles, offsets = self._get_ipc_meta(self.meta) + self.full_nvlink = full_nvlink + self._ptr = ops.init_custom_ar(self.meta, self.rank_data, handles, + offsets, rank, self.full_nvlink) + self.register_buffer(self.buffer) + + @contextmanager + def capture(self): + """ + The main responsibility of this context manager is the + `register_graph_buffers` call at the end of the context. + It records all the buffer addresses used in the CUDA graph. + """ + try: + self._IS_CAPTURING = True + yield + finally: + self._IS_CAPTURING = False + if not self.disabled: + self.register_graph_buffers() + + def _get_ipc_meta(self, inp: torch.Tensor): + # if current_platform.is_rocm(): + if 1: + # _share_cuda_() doesn't accept meta buffer not allocated from + # PyTorch cache allocator, use direct HIP call to get IPC handle + handle = ops.get_meta_buffer_ipc_handle(inp) + shard_data = ( + bytes(handle), # ipc handle to base ptr + 0, # offset of base ptr + ) + else: + data = inp.untyped_storage()._share_cuda_() + shard_data = ( + data[1], # ipc handle to base ptr + data[3], # offset of base ptr + ) + return self._gather_ipc_meta(shard_data) + + def _gather_ipc_meta(self, shard_data): + # Note: don't use `[[None]] * self.world_size` here + # because it will create a list of the same reference + all_data: List[Optional[Any]] = [[None] + for i in range(self.world_size)] + all_data[self.rank][0] = shard_data + + ranks = 
def register_graph_buffers(self):
    """Register the graph-captured buffers of every rank with the native side.

    Reads this rank's IPC handle and offsets from
    ``ops.get_graph_buffer_ipc_meta``, exchanges them with the other ranks
    through ``_gather_ipc_meta`` and hands the gathered lists to
    ``ops.register_graph_buffers``.
    """
    local_handle, local_offsets = ops.get_graph_buffer_ipc_meta(self._ptr)
    local_meta = (bytes(local_handle), local_offsets)
    all_handles, all_offsets = self._gather_ipc_meta(local_meta)
    # The count logged is this rank's own number of offsets.
    logger.info("Registering %d cuda graph addresses", len(local_offsets))
    ops.register_graph_buffers(self._ptr, all_handles, all_offsets)
def close(self):
    """Dispose of the native custom-allreduce handle, if one exists.

    Safe to call repeatedly and on a partially constructed object:
    ``__del__`` calls this method even when ``__init__`` raised after
    clearing ``self.disabled`` but before ``self._ptr`` was assigned, in
    which case there is nothing to dispose.
    """
    # A missing ``disabled`` flag means construction never got started;
    # treat it the same as a disabled communicator.
    if getattr(self, "disabled", True):
        return
    ptr = getattr(self, "_ptr", 0)
    if ptr:
        ops.dispose(ptr)
        # Zero the handle so a later close()/__del__ cannot dispose twice.
        self._ptr = 0
def producer(batch_src: Sequence[int],
             producer_queue,
             consumer_queue,
             result_queue,
             cuda_visible_devices: Optional[str] = None):
    """Source-side worker of the batched P2P probe.

    For every device id in ``batch_src`` it allocates a 1024-byte buffer
    filled with 1s, publishes its IPC handle on ``producer_queue`` and waits
    on ``consumer_queue`` for the peer's "opened" flag. When the peer opened
    the handle, the two queues are used as a barrier and the buffer is read
    back; the probe succeeds only if every byte now equals 2. One result per
    device is put on ``result_queue`` and the device is reset afterwards.

    ``cuda_visible_devices``, when not None, is exported as
    ``CUDA_VISIBLE_DEVICES`` before the runtime library is loaded.
    """
    if cuda_visible_devices is not None:
        update_environment_variables(
            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})

    probe_bytes = 1024
    runtime = CudaRTLibrary()
    for device_id in batch_src:
        runtime.cudaSetDevice(device_id)
        dev_ptr = runtime.cudaMalloc(probe_bytes)
        runtime.cudaMemset(dev_ptr, 1, probe_bytes)
        runtime.cudaDeviceSynchronize()
        producer_queue.put(runtime.cudaIpcGetMemHandle(dev_ptr))
        open_success = consumer_queue.get()
        if open_success:
            # use two queues to simulate barrier
            producer_queue.put(0)
            consumer_queue.get()
            # check if the memory is modified
            host_data = (ctypes.c_char * probe_bytes)()
            runtime.cudaMemcpy(host_data, dev_ptr, probe_bytes)  # type: ignore
            if host_data.raw != b"\x02" * probe_bytes:
                open_success = False
        result_queue.put(open_success)
        runtime.cudaDeviceReset()
def can_actually_p2p(
    batch_src: Sequence[int],
    batch_tgt: Sequence[int],
) -> Sequence[bool]:
    """
    Usually, checking if P2P access is enabled can be done by
    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
    returns `True` even if P2P access is not actually possible.
    See https://github.com/vllm-project/vllm/issues/2728 .
    Therefore, we have to perform a real P2P access to check if it is actually
    possible.

    Note on p2p and cuda IPC:
    Usually, one process uses one GPU:
    GPU src --> cuda context src --> tensor src --> process src

    We need to combine p2p and cuda IPC, so that:
    GPU src --> cuda context src --> tensor src --> process src
                                      |shared|
    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
    That is to say, process src creates a tensor in GPU src, passes IPC handle to
    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
    tensor in process tgt will be reflected in the tensor in process src, because
    they are the same memory segment.
    It is important to note that process tgt accesses the tensor in GPU tgt, not
    GPU src. That's why we need p2p access.

    The most time-consuming part is the process creation. To avoid creating
    processes for every pair of GPUs, we use batched testing. We create two
    processes for testing all pairs of GPUs in batch. The trick is to reset
    the device after each test (which is not available in PyTorch).

    Args:
        batch_src: source device ids, one per pair to test.
        batch_tgt: target device ids, aligned element-wise with `batch_src`.

    Returns:
        One bool per (src, tgt) pair; a pair on which the two worker
        processes disagree is reported as False.
    """ # noqa
    # Pass CUDA_VISIBLE_DEVICES to the child processes so they see the same
    # set of GPUs. An unset variable must stay None: the workers only
    # override their environment for a non-None value, and exporting a
    # placeholder such as "-1" would hide every GPU from them.
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")

    # make sure the processes are spawned
    smp = mp.get_context("spawn")
    producer_queue = smp.Queue()
    consumer_queue = smp.Queue()
    result_queue = smp.Queue()
    p_src = smp.Process(target=producer,
                        args=(batch_src, producer_queue, consumer_queue,
                              result_queue, cuda_visible_devices))
    p_tgt = smp.Process(target=consumer,
                        args=(batch_tgt, producer_queue, consumer_queue,
                              result_queue, cuda_visible_devices))
    p_src.start()
    p_tgt.start()
    p_src.join()
    p_tgt.join()
    assert p_src.exitcode == 0 and p_tgt.exitcode == 0, (
        f"P2P probe workers failed: producer exit code {p_src.exitcode}, "
        f"consumer exit code {p_tgt.exitcode}")
    result: List[bool] = []
    for src, tgt in zip(batch_src, batch_tgt):
        # Each worker reports once per pair, so two entries belong to a pair.
        a = result_queue.get()
        b = result_queue.get()
        if a != b:
            logger.warning(
                "Two processes do not agree on the P2P access"
                " status on %d -> %d, treat as disabled.", src, tgt)
            result.append(False)
        else:
            result.append(a)
    return result
# Process-wide memo of the P2P probe results, keyed "src->tgt" with **local**
# device ids (0 .. num_dev-1). It stays None until the first check has loaded
# the JSON cache file, whose name is suffixed with CUDA_VISIBLE_DEVICES so
# that different device selections do not share results.
_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None


def _visible_devices_key(num_dev: int) -> str:
    """Return the device list used to name the cache file.

    This is the raw ``CUDA_VISIBLE_DEVICES`` string when the variable is set
    (it may be a comma-separated list, so it must not be parsed as an int),
    otherwise ``"0,1,...,num_dev-1"``.
    """
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible is None:
        visible = ",".join(str(i) for i in range(num_dev))
    return visible


def gpu_p2p_access_check(src: int, tgt: int) -> bool:
    """Check if GPU src can access GPU tgt.

    The first call loads (and, on the local master, generates if missing) a
    JSON cache of the P2P status of every device pair; later calls are
    answered from the in-process copy.

    Raises:
        RuntimeError: if the probing subprocess exits with a non-zero code.
        KeyError: if the pair is not present in the cache.
    """

    # if the cache variable is already calculated,
    # read from the cache instead of checking it again
    global _gpu_p2p_access_cache
    if _gpu_p2p_access_cache is not None:
        return _gpu_p2p_access_cache[f"{src}->{tgt}"]

    is_distributed = dist.is_initialized()

    num_dev = cuda_device_count_stateless()
    cuda_visible_devices = _visible_devices_key(num_dev)

    path = os.path.join(
        VLLM_CACHE_ROOT,
        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if is_distributed:
        # The world group is only consulted when torch.distributed is
        # initialized, so the import is deferred to that case.
        # NOTE(review): this still imports from vllm -- confirm whether the
        # package's own parallel_state module should be used instead.
        from vllm.distributed.parallel_state import get_world_group
    if ((not is_distributed or get_world_group().local_rank == 0)
            and (not os.path.exists(path))):
        # only the local master process (with local_rank == 0) can
        # enter this block to calculate the cache
        logger.info("generating GPU P2P access cache in %s", path)
        cache: Dict[str, bool] = {}
        ids = list(range(num_dev))
        # batch of all pairs of GPUs
        batch_src, batch_tgt = zip(*product(ids, ids))
        # NOTE: we use `subprocess` rather than `multiprocessing` here
        # because the caller might not have `if __name__ == "__main__":`,
        # in that case we cannot use spawn method in multiprocessing.
        # However, `can_actually_p2p` requires spawn method.
        # The fix is, we use `subprocess` to call the function,
        # where we have `if __name__ == "__main__":` in this file.

        # use a temporary file to store the result
        # we don't use the output of the subprocess directly,
        # because the subprocess might produce logging output
        with tempfile.NamedTemporaryFile() as output_file:
            input_bytes = pickle.dumps(
                (batch_src, batch_tgt, output_file.name))
            returned = subprocess.run([sys.executable, __file__],
                                      input=input_bytes,
                                      capture_output=True)
            # check if the subprocess is successful
            try:
                returned.check_returncode()
            except Exception as e:
                # wrap raised exception to provide more information
                raise RuntimeError(
                    f"Error happened when batch testing "
                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
                    f"{returned.stderr.decode()}") from e
            # The file is written by the subprocess started above.
            with open(output_file.name, "rb") as f:
                result = pickle.load(f)
        for _i, _j, r in zip(batch_src, batch_tgt, result):
            cache[f"{_i}->{_j}"] = r
        with open(path, "w") as f:
            json.dump(cache, f, indent=4)
    if is_distributed:
        # every rank waits here until the local master has written the file
        get_world_group().barrier()
    logger.info("reading GPU P2P access cache from %s", path)
    with open(path, "r") as f:
        cache = json.load(f)
    _gpu_p2p_access_cache = cache
    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
def _can_p2p(rank: int, world_size: int) -> bool:
    """Return True when ``rank`` passes the P2P check against every other rank.

    Ranks ``0 .. world_size-1`` are tried in order, skipping ``rank`` itself,
    and the scan stops at the first failing peer. With no peers to test the
    result is True.
    """
    peers = (peer for peer in range(world_size) if peer != rank)
    return all(gpu_p2p_access_check(rank, peer) for peer in peers)
"blockwise_int8", "fp8"] + + # max_size: max supported allreduce size + def __init__(self, + tp: GroupCoordinator, + pp: GroupCoordinator, + #group: ProcessGroup, + #local_world_size: int, + quant_method: str, + output_size: int, + min_block_size_m: Optional[int] = 16, + min_block_size_n: Optional[int] = 16, + max_seq_len: Optional[int] = 256) -> None: + """ + Args: + group: the process group to work on. If None, it will use the + default process group. + output_size: second dimension of matrix A. (N) + It is the caller's responsibility to make sure each communicator + is bind to a unique device, and all communicators in this group + are in the same node. + """ + self.disabled = True + + if quant_method not in CustomGemmAllreduce._SUPPORTED_QUANTIZATION_METHOD: + logger.warning( + "Custom gemm-allreduce is disabled due to an unsupported quant method" + ": %s. Supported quant methods: %s", + quant_method, str(CustomGemmAllreduce._SUPPORTED_QUANTIZATION_METHOD)) + return + + self.tp = tp + self.pp = pp + self.world_size = tp.world_size + self.cur_rank = tp.rank_in_group + # self.local_world_size = local_world_size + # self.nnodes = self.world_size // self.local_world_size + self.max_seq_len = max_seq_len + self.quant_method = quant_method + self.min_block_size_m = min_block_size_m + self.min_block_size_n = min_block_size_n + + # shmem buffer + try: + if USE_NVSHMEM: + pynvshmem.init_nvshmem_by_uniqueid(tp.device_group) + self.rocshmem_ctx = None + else: + start_rank = pp.rank_in_group * tp.world_size + pyrocshmem.init_rocshmem_by_uniqueid(start_rank, tp.device_group) + self.rocshmem_ctx = pyrocshmem.rocshmemx_get_device_ctx_cached() + except Exception as e: + logger.exception(f"Error in request: {e}") + logger.warning( + "pyrocshmem init failed") + return + + # NOTE: make sure that this is the minimum block size in awq-gemm configs + # this size will be checked also in gemm_allreduce_a16w4 + self.max_tiles = \ + cdiv(max_seq_len, min_block_size_m) * cdiv(output_size, 
def close(self):
    """Drop this object's references to its shared-memory buffers.

    Idempotent. ``__del__`` always calls ``close()`` again, so a previous
    explicit ``close()`` must not leave the object in a state where reading
    a buffer attribute raises ``AttributeError``. After closing, the
    instance is marked disabled so ``should_custom_gemm_allreduce`` reports
    False instead of pointing callers at released buffers.
    """
    # A missing flag means __init__ never ran; nothing to release.
    if getattr(self, "disabled", True):
        return
    buffer_attrs = (
        "barriers_buf", "scatters_buf", "reduces_buf",
        "barrier_buf", "scatter_buf", "reduce_buf",
    )
    for attr in buffer_attrs:
        # Rebind to None instead of `del`: the attribute keeps existing, so
        # a repeated close() finds nothing to do rather than failing.
        if getattr(self, attr, None) is not None:
            setattr(self, attr, None)
    self.disabled = True
"awq": + qweight, scales, qzeros = args + return gemm_allreduce_a16w4( + input, qweight, scales, qzeros, + self.barriers_buf, self.scatters_buf, + self.max_tiles, + self.group, + self.rocshmem_ctx) + elif self.quant_method == "blockwise_int8" or self.quant_method == "fp8": + weight, x_scale, w_scale, block_size, dtype = args + # return gemm_allreduce_w8a8( + # input, weight, x_scale, w_scale, block_size, dtype, + # self.barriers_buf, self.scatters_buf, + # self.barrier_buf, self.scatter_buf, + # self.max_tiles, + # self.group, + # self.local_world_size, + # self.rocshmem_ctx) + return gemm_allreduce_w8a8_v2( + input, weight, x_scale, w_scale, block_size, dtype, + self.barriers_buf, self.scatters_buf, self.reduces_buf, + self.barrier_buf, self.scatter_buf, self.reduce_buf, + self.max_seq_len, + self.max_tiles, + self.tp, + self.pp, + self.rocshmem_ctx) + + # for debug + # if self.group.rank() == 0: + # print(f"{self.group.rank()=} {out.shape=} {self.scatter_buf[:1,::384]} {out[:1, 0]}") + # print(f"{self.group.rank()=} {self.scatter_buf.shape=} {self.scatter_buf[:4,:384]=}") + # print(f"{self.group.rank()=} {self.scatter_buf.shape=} {self.scatter_buf[:4,384:]=}") + # return out + + # TODO: support more quant types here + # currently should not reach here + assert self.quant_method in self._SUPPORTED_QUANTIZATION_METHOD, ( + f"{self.quant_method=} currently not supported") + + def should_custom_gemm_allreduce(self, input: torch.tensor): + if self.disabled: + return False + if input.shape[0] > self.max_seq_len: + return False + return True + + \ No newline at end of file diff --git a/aiter/dist/device_communicators/base_device_communicator.py b/aiter/dist/device_communicators/base_device_communicator.py new file mode 100644 index 0000000000000000000000000000000000000000..a10d563429adb923f5138e35d5bda485a8bbbdd0 --- /dev/null +++ b/aiter/dist/device_communicators/base_device_communicator.py @@ -0,0 +1,297 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
class Cache:
    """Thread-safe memo of objects keyed by their construction kwargs.

    Entries are held in a ``WeakValueDictionary``, so the cache itself does
    not keep an instance alive once no other reference to it remains.
    """

    def __init__(self):
        self._cache: WeakValueDictionary = WeakValueDictionary()
        # Reentrant, so the factory may call back into the same cache.
        self._lock = threading.RLock()

    def get_or_create(self, kwargs, func):
        """Return the cached object for ``kwargs``, building it on a miss.

        The key is the sorted tuple of ``kwargs`` items, so insertion order
        is irrelevant. On a miss ``func(**kwargs)`` is called while the lock
        is held and its result is stored before being returned.
        """
        # Create a hashable key from the kwargs
        lookup_key = tuple(sorted(kwargs.items()))

        with self._lock:
            cached = self._cache.get(lookup_key)
            if cached is not None:
                return cached
            created = func(**kwargs)
            self._cache[lookup_key] = created
            return created
+ raise NotImplementedError + + def dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_sequence_parallel: bool = False, + ): + raise NotImplementedError + + def set_num_sms(self, num_sms: int): + pass + + def max_sms_used(self) -> int | None: + return None # None means it could use the whole GPU + + def combine(self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False): + raise NotImplementedError + + def destroy(self): + pass + + +class DeviceCommunicatorBase: + """ + Base class for device-specific communicator. + It can use the `cpu_group` to initialize the communicator. + If the device has PyTorch integration (PyTorch can recognize its + communication backend), the `device_group` will also be given. + """ + + def __init__( + self, + cpu_group: ProcessGroup, + device: torch.device | None = None, + device_group: ProcessGroup | None = None, + unique_name: str = "", + ): + self.device = device or torch.device("cpu") + self.cpu_group = cpu_group + self.device_group = device_group + self.unique_name = unique_name + self.rank = dist.get_rank(cpu_group) + self.world_size = dist.get_world_size(cpu_group) + self.ranks = dist.get_process_group_ranks(cpu_group) + self.global_rank = dist.get_rank() + self.global_world_size = dist.get_world_size() + self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank) + + use_ep = False + all2all_backend = None + # from vllm.config import get_current_vllm_config + + # config = get_current_vllm_config() + # if config is not None: + # # as long as we use data parallel (coupled data parallel + # # where all data parallel ranks execute forward together), + # # we initialize the all2all manager used in expert parallel. 
def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """Gather ``input_`` from every rank and concatenate along ``dim``.

    Args:
        input_: this rank's tensor.
        dim: concatenation dimension; negative values count from the end.

    Returns:
        A tensor whose size along ``dim`` is ``world_size`` times that of
        ``input_``. With a single rank, ``input_`` itself is returned.

    Raises:
        AssertionError: if ``dim`` is out of range for ``input_``.
    """
    world_size = self.world_size
    # Bypass the function if we are using only 1 GPU, as reduce_scatter does.
    if world_size == 1:
        return input_
    # Same range check as reduce_scatter/gather: without it a dim below
    # -input_.dim() stays negative after the conversion below and is then
    # used to slice the shape.
    assert (
        -input_.dim() <= dim < input_.dim()
    ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
    if dim < 0:
        # Convert negative dim to positive.
        dim += input_.dim()
    input_size = input_.size()
    # NOTE: we have to use concat-style all-gather here,
    # stack-style all-gather has compatibility issues with
    # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
    output_size = (input_size[0] * world_size,) + input_size[1:]
    # Allocate output tensor.
    output_tensor = torch.empty(
        output_size, dtype=input_.dtype, device=input_.device
    )
    # All-gather.
    dist.all_gather_into_tensor(output_tensor, input_, group=self.device_group)
    # Reshape: split off the rank axis, move it next to `dim`, then merge
    # the two so the per-rank chunks sit back to back along `dim`.
    output_tensor = output_tensor.reshape((world_size,) + input_size)
    output_tensor = output_tensor.movedim(0, dim)
    output_tensor = output_tensor.reshape(
        input_size[:dim]
        + (world_size * input_size[dim],)
        + input_size[dim + 1 :]
    )
    return output_tensor
+ dim += input_.dim() + + # Note: This will produce an incorrect answer if we don't make + # the input_tensor contiguous. Possible bug in reduce_scatter_tensor? + input_tensor = input_.movedim(0, dim).contiguous() + + assert input_tensor.shape[0] % world_size == 0 + chunk_size = input_tensor.shape[0] // world_size + output_shape = (chunk_size,) + input_tensor.shape[1:] + + output_tensor = torch.empty( + output_shape, dtype=input_tensor.dtype, device=input_tensor.device + ) + + # Perform reduce-scatter operation + torch.distributed.reduce_scatter_tensor( + output_tensor, input_tensor, group=self.device_group + ) + + # Reshape before returning + return output_tensor.movedim(0, dim).contiguous() + + def reduce_scatterv( + self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None + ) -> torch.Tensor: + raise NotImplementedError + + def gather( + self, input_: torch.Tensor, dst: int = 0, dim: int = -1 + ) -> torch.Tensor | None: + """ + NOTE: We assume that the input tensor is on the same device across + all the ranks. + NOTE: `dst` is the local rank of the destination rank. + """ + world_size = self.world_size + assert ( + -input_.dim() <= dim < input_.dim() + ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}" + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + + # Allocate output tensor. + if self.rank_in_group == dst: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + else: + gather_list = None + # Gather. 
def prepare_communication_buffer_for_model(self, model: torch.nn.Module) -> None:
    """
    Prepare the communication buffer for the model.

    Does nothing unless this is an expert-parallel communicator. Otherwise
    every module of ``model`` whose class is named ``FusedMoE`` or
    ``SharedFusedMoE`` gets ``quant_method.init_prepare_finalize(module)``
    called on it, after the full module list has been collected.
    """
    if not self.is_ep_communicator:
        return

    # TODO(bnell): Should use isinstance but can't. Maybe search for
    # presence of quant_method.init_prepare_finalize?
    moe_class_names = ("FusedMoE", "SharedFusedMoE")
    targets = [
        candidate
        for candidate in model.modules()
        if candidate.__class__.__name__ in moe_class_names
    ]
    for moe_layer in targets:
        moe_layer.quant_method.init_prepare_finalize(moe_layer)
+ """ + return hidden_states, router_logits + + def combine( + self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False + ) -> torch.Tensor: + """ + Combine the hidden states and router logits from the appropriate device. + This is a no-op in the base class. + """ + return hidden_states diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..4c13965a9603608228fa8263c1c83c0ea6b28f2d --- /dev/null +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -0,0 +1,364 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +from torch.distributed import ProcessGroup + +should_nccl_symm_mem_allreduce = False +from aiter.dist.parallel_state import is_global_first_rank +from aiter import logger +from .base_device_communicator import DeviceCommunicatorBase + + +class CudaCommunicator(DeviceCommunicatorBase): + def __init__( + self, + cpu_group: ProcessGroup, + device: torch.device | None = None, + device_group: ProcessGroup | None = None, + unique_name: str = "", + ): + super().__init__(cpu_group, device, device_group, unique_name) + if "tp" not in unique_name: + # custom allreduce or torch symm mem can be used only by tp + use_custom_allreduce = False + use_torch_symm_mem = False + else: + from aiter.dist.parallel_state import _ENABLE_CUSTOM_ALL_REDUCE + + use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE + use_torch_symm_mem = False + + self.use_custom_allreduce = use_custom_allreduce + self.use_torch_symm_mem = use_torch_symm_mem + + # lazy import to avoid documentation build error + from aiter.dist.device_communicators.custom_all_reduce import ( + CustomAllreduce, + ) + + # from aiter.dist.device_communicators.symm_mem import SymmMemCommunicator + + self.pynccl_comm = None + if self.world_size > 1: + from 
aiter.dist.device_communicators.communicator_pynccl import ( + PyNcclCommunicator, + ) + + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, + device=self.device, + ) + # if is_symmetric_memory_enabled(): + # register_nccl_symmetric_ops(self.pynccl_comm) + + self.ca_comm: CustomAllreduce | None = None + self.qr_comm = None + self.symm_mem_comm = None + # if use_torch_symm_mem and current_platform.is_cuda(): + # self.symm_mem_comm = SymmMemCommunicator( + # group=self.cpu_group, + # device=self.device, + # ) + + if use_custom_allreduce and self.world_size > 1: + # Initialize a custom fast all-reduce implementation. + self.ca_comm = CustomAllreduce( + group=self.cpu_group, + device=self.device, + # symm_mem_enabled=( + # self.symm_mem_comm is not None and not self.symm_mem_comm.disabled + # ), + ) + + if self.world_size > 1: + from aiter.dist.device_communicators.quick_all_reduce import ( + QuickAllReduce, + ) + + # # Initialize a custom quick all-reduce implementation for hygon. + # # Quick reduce is designed as a complement to custom allreduce. + # # Based on quickreduce (https://github.com/mk1-project/quickreduce). + # # If it's a rocm, 'use_custom_allreduce==True' means it must + # # currently be an HYGON series. 
+ self.qr_comm = QuickAllReduce(group=self.cpu_group, device=self.device) + + if self.use_all2all: + if self.all2all_backend == "naive": + from .all2all import NaiveAll2AllManager + + self.all2all_manager = NaiveAll2AllManager(self.cpu_group) + elif self.all2all_backend == "allgather_reducescatter": + from .all2all import AgRsAll2AllManager + + self.all2all_manager = AgRsAll2AllManager(self.cpu_group) + elif self.all2all_backend == "pplx": + from .all2all import PPLXAll2AllManager + + self.all2all_manager = PPLXAll2AllManager(self.cpu_group) + elif self.all2all_backend == "deepep_high_throughput": + from .all2all import DeepEPHTAll2AllManager + + self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group) + elif self.all2all_backend == "deepep_low_latency": + from .all2all import DeepEPLLAll2AllManager + + self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group) + elif self.all2all_backend == "flashinfer_all2allv": + from .all2all import FlashInferAllToAllManager + + self.all2all_manager = FlashInferAllToAllManager(self.cpu_group) + else: + raise ValueError(f"Unknown all2all backend: {self.all2all_backend}") + + if is_global_first_rank(): + logger.info( + "Using %s all2all manager.", + self.all2all_manager.__class__.__name__, + ) + + def all_reduce(self, input_, ca_fp8_quant: bool = False) -> torch.Tensor: + # always try quick reduce first, then custom allreduce, + # and then pynccl. 
(quick reduce just for ROCM MI3*) + qr_comm = self.qr_comm + if ( + qr_comm is not None + and not qr_comm.disabled + and qr_comm.should_quick_allreduce(input_) + ): + out = qr_comm.quick_all_reduce(input_) + assert out is not None + return out + + ca_comm = self.ca_comm + if ( + ca_comm is not None + and not ca_comm.disabled + and ca_comm.should_custom_ar(input_) + ): + out = ca_comm.custom_all_reduce(input_, ca_fp8_quant) + assert out is not None + return out + symm_mem_comm = self.symm_mem_comm + if symm_mem_comm is not None and symm_mem_comm.should_use_symm_mem(input_): + out = symm_mem_comm.all_reduce(input_) + assert out is not None + return out + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + out = pynccl_comm.all_reduce(input_) + assert out is not None + return out + # fall back to the default all-reduce using PyTorch. + # this usually happens during testing. + # when we run the model, allreduce only happens for the TP + # group, where we always have either custom allreduce or pynccl. 
+ out = input_.clone() + torch.distributed.all_reduce(out, group=self.device_group) + return out + + def fused_allreduce_rmsnorm( + self, input_, res_inp_, weight_, eps + ) -> tuple[torch.Tensor, torch.Tensor]: + n = input_.shape[-1] + can_use_fuse_ar_rms = ( + n <= 16384 + and input_.numel() * input_.element_size() < 8 * 1024 * 8192 + and self.world_size != 6 + ) + ca_comm = self.ca_comm + if ( + ca_comm is not None + and not ca_comm.disabled + and ca_comm.should_custom_ar(input_) + and can_use_fuse_ar_rms + ): + res_out, out = ca_comm.custom_fused_ar_rms(input_, res_inp_, weight_, eps) + assert out is not None + assert res_out is not None + return res_out, out + # call split kernel + ar_out = self.all_reduce(input_) + out = torch.empty_like(ar_out) + residual_out = torch.empty_like(ar_out) + from aiter import rmsnorm2d_fwd_with_add + + rmsnorm2d_fwd_with_add( + out, + ar_out, + input_, + residual_out, + weight_, + eps, + 0, + ) + return residual_out, out + + def reduce_scatter(self, input_: torch.Tensor, dim: int = -1): + world_size = self.world_size + pynccl_comm = self.pynccl_comm + assert pynccl_comm is not None + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + + # Note: This will produce an incorrect answer if we don't make + # the input_tensor contiguous. Possible bug in reduce_scatter_tensor? 
+ input_tensor = input_.movedim(0, dim).contiguous() + + assert input_tensor.shape[0] % world_size == 0 + chunk_size = input_tensor.shape[0] // world_size + output_shape = (chunk_size,) + input_tensor.shape[1:] + + output = torch.empty( + output_shape, dtype=input_tensor.dtype, device=input_tensor.device + ) + + pynccl_comm.reduce_scatter(output, input_tensor) + + # Reshape before returning + return output.movedim(0, dim).contiguous() + + def reduce_scatterv( + self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None + ): + world_size = self.world_size + pynccl_comm = self.pynccl_comm + assert pynccl_comm is not None + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + + # Note: This will produce an incorrect answer if we don't make + # the input_tensor contiguous. Possible bug in reduce_scatter_tensor? + input_tensor = input_.movedim(0, dim).contiguous() + + if sizes is not None: + assert len(sizes) == world_size + assert input_tensor.shape[0] == sum(sizes) + chunk_size = sizes[self.rank_in_group] + else: + assert input_tensor.shape[0] % world_size == 0 + chunk_size = input_tensor.shape[0] // world_size + output_shape = (chunk_size,) + input_tensor.shape[1:] + + output = torch.empty( + output_shape, dtype=input_tensor.dtype, device=input_tensor.device + ) + + if sizes is not None: + pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes) + else: + pynccl_comm.reduce_scatter(output, input_tensor) + + # Reshape before returning + return output.movedim(0, dim).contiguous() + + def send(self, tensor: torch.Tensor, dst: int | None = None) -> None: + """Sends a tensor to the destination rank in a blocking way""" + """NOTE: `dst` is the local rank of the destination rank.""" + if dst is None: + dst = (self.rank_in_group + 1) % self.world_size + + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.send(tensor, dst) + else: + torch.distributed.send(tensor, self.ranks[dst], 
self.device_group) + + def recv( + self, size: torch.Size, dtype: torch.dtype, src: int | None = None + ) -> torch.Tensor: + """Receives a tensor from the source rank.""" + """NOTE: `src` is the local rank of the source rank.""" + if src is None: + src = (self.rank_in_group - 1) % self.world_size + + tensor = torch.empty(size, dtype=dtype, device=self.device) + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.recv(tensor, src) + else: + torch.distributed.recv(tensor, self.ranks[src], self.device_group) + return tensor + + def destroy(self): + if self.pynccl_comm is not None: + self.pynccl_comm = None + if self.qr_comm is not None: + self.qr_comm = None + if self.ca_comm is not None: + self.ca_comm = None + if self.all2all_manager is not None: + self.all2all_manager.destroy() + self.all2all_manager = None + + def all_gatherv( + self, + input_: torch.Tensor | list[torch.Tensor], + dim: int = 0, + sizes: list[int] | None = None, + ): + if dim != 0: + raise NotImplementedError("only dim 0 all-gatherv is supported") + world_size = self.world_size + pynccl_comm = self.pynccl_comm + assert pynccl_comm is not None and not pynccl_comm.disabled + + # 'sizes' is not needed if all inputs in the same group have the same + # shape + if sizes is not None and all(s == sizes[0] for s in sizes): + sizes = None + + def _all_gather_single(input_: torch.Tensor, sizes: list[int] | None = None): + input_size = input_.size() + if sizes is not None: + assert len(sizes) == world_size + assert ( + input_.shape[dim] == sizes[self.rank_in_group] + ), f"{input_.shape[dim]} != {sizes[self.rank_in_group]}" + output_size = (sum(sizes),) + input_size[1:] + else: + output_size = (input_size[0] * world_size,) + input_size[1:] + # Allocate output tensor. 
+ output_tensor = torch.empty( + output_size, dtype=input_.dtype, device=input_.device + ) + if sizes is not None: + pynccl_comm.all_gatherv(output_tensor, input_, sizes=sizes) + else: + pynccl_comm.all_gather(output_tensor, input_) + return output_tensor + + if isinstance(input_, torch.Tensor): + return _all_gather_single(input_, sizes) + + output_list = [] + pynccl_comm.group_start() + for inp in input_: + output_list.append(_all_gather_single(inp, sizes=sizes)) + pynccl_comm.group_end() + + return output_list + + def dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_sequence_parallel: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + assert self.all2all_manager is not None + hidden_states, router_logits = self.all2all_manager.dispatch( + hidden_states, router_logits, is_sequence_parallel + ) + return hidden_states, router_logits + + def combine( + self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False + ) -> torch.Tensor: + assert self.all2all_manager is not None + hidden_states = self.all2all_manager.combine( + hidden_states, is_sequence_parallel + ) + return hidden_states diff --git a/aiter/dist/device_communicators/communicator_pynccl.py b/aiter/dist/device_communicators/communicator_pynccl.py new file mode 100644 index 0000000000000000000000000000000000000000..c0bfaf358de6faea07a1f3d65139fc8b0b05cbe4 --- /dev/null +++ b/aiter/dist/device_communicators/communicator_pynccl.py @@ -0,0 +1,381 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +# ===================== import region ===================== +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup, ReduceOp + +from .pynccl_wrapper import ( + NCCLLibrary, + buffer_type, + cudaStream_t, + ncclComm_t, + ncclDataTypeEnum, + ncclRedOpTypeEnum, + ncclUniqueId, +) +from aiter import logger + +current_stream = torch.cuda.current_stream 
+_NCCL_SYMM_OPS_REGISTERED = False + + +# def register_nccl_symmetric_ops(pynccl_comm): +# from vllm.distributed.device_communicators.pynccl_allocator import ( +# nccl_symm_mem_context, +# ) +# from vllm.utils.torch_utils import direct_register_custom_op + +# global _NCCL_SYMM_OPS_REGISTERED +# if _NCCL_SYMM_OPS_REGISTERED: +# return +# _NCCL_SYMM_OPS_REGISTERED = True + +# def all_reduce_symmetric_with_copy_impl(input_tensor: torch.Tensor) -> torch.Tensor: +# with nccl_symm_mem_context(pynccl_comm): +# symm_input = torch.empty_like(input_tensor) +# symm_output = torch.empty_like(input_tensor) +# symm_input.copy_(input_tensor) +# symm_output = pynccl_comm.all_reduce(symm_input, symm_output) +# return symm_output + +# def all_reduce_symmetric_with_copy_fake(input_tensor: torch.Tensor) -> torch.Tensor: +# return torch.empty_like(input_tensor) + +# direct_register_custom_op( +# op_name="all_reduce_symmetric_with_copy", +# op_func=all_reduce_symmetric_with_copy_impl, +# fake_impl=all_reduce_symmetric_with_copy_fake, +# ) + + +class PyNcclCommunicator: + def __init__( + self, + group: ProcessGroup, + device: int | str | torch.device, + library_path: str | None = None, + ): + """ + Args: + group: the process group to work on. If None, it will use the + default process group. + device: the device to bind the PyNcclCommunicator to. If None, + it will be bound to f"cuda:{local_rank}". + library_path: the path to the NCCL library. If None, it will + use the default library path. + It is the caller's responsibility to make sure each communicator + is bind to a unique device. + """ + if isinstance(group, ProcessGroup): + assert dist.is_initialized() + assert ( + dist.get_backend(group) != dist.Backend.NCCL + ), "PyNcclCommunicator should be attached to a non-NCCL group." 
+ # note: this rank is the rank in the group + self.rank = dist.get_rank(group) + self.world_size = dist.get_world_size(group) + else: + self.rank = group.rank + self.world_size = group.world_size + + self.group = group + + # if world_size == 1, no need to create communicator + if self.world_size == 1: + self.available = False + self.disabled = True + return + try: + self.nccl = NCCLLibrary(library_path) + except Exception as e: + print(f"Failed to load NCCL library: {e}") + # disable because of missing NCCL library + # e.g. in a non-GPU environment + self.available = False + self.disabled = True + return + + self.available = True + self.disabled = False + + self.nccl_version = self.nccl.ncclGetRawVersion() + if self.rank == 0: + # get the unique id from NCCL + self.unique_id = self.nccl.ncclGetUniqueId() + logger.info(f"load NCCL version: {self.nccl_version}") + else: + # construct an empty unique id + self.unique_id = ncclUniqueId() + + if isinstance(group, ProcessGroup): + tensor = torch.ByteTensor(list(self.unique_id.internal)) + ranks = dist.get_process_group_ranks(group) + # arg `src` in `broadcast` is the global rank + dist.broadcast(tensor, src=ranks[0], group=group) + byte_list = tensor.tolist() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + else: + self.unique_id = group.broadcast_obj(self.unique_id, src=0) + if isinstance(device, int): + device = torch.device(f"cuda:{device}") + elif isinstance(device, str): + device = torch.device(device) + # now `device` is a `torch.device` object + assert isinstance(device, torch.device) + self.device = device + # nccl communicator and stream will use this device + # `torch.cuda.device` is a context manager that changes the + # current cuda device to the specified one + with torch.cuda.device(device): + self.comm: ncclComm_t = self.nccl.ncclCommInitRank( + self.world_size, self.unique_id, self.rank + ) + + stream = current_stream() + # A small all_reduce for warmup. 
+ data = torch.zeros(1, device=device) + self.all_reduce(data) + stream.synchronize() + del data + + def all_reduce( + self, + in_tensor: torch.Tensor, + out_tensor: torch.Tensor = None, + op: ReduceOp = ReduceOp.SUM, + stream=None, + ) -> torch.Tensor: + if self.disabled: + return None + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert in_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {in_tensor.device}" + ) + + if out_tensor is None: + out_tensor = torch.empty_like(in_tensor) + + if stream is None: + stream = current_stream() + self.nccl.ncclAllReduce( + buffer_type(in_tensor.data_ptr()), + buffer_type(out_tensor.data_ptr()), + in_tensor.numel(), + ncclDataTypeEnum.from_torch(in_tensor.dtype), + ncclRedOpTypeEnum.from_torch(op), + self.comm, + cudaStream_t(stream.cuda_stream), + ) + return out_tensor + + def all_gather( + self, output_tensor: torch.Tensor, input_tensor: torch.Tensor, stream=None + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclAllGather( + buffer_type(input_tensor.data_ptr()), + buffer_type(output_tensor.data_ptr()), + input_tensor.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def all_gatherv( + self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + sizes: list[int], + stream=None, + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same 
device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + assert output_tensor.shape[0] == sum(sizes) + split_offset = 0 + self.nccl.ncclGroupStart() + for root, split_size in enumerate(sizes): + dst_slice = output_tensor[split_offset : split_offset + split_size] + self.nccl.ncclBroadcast( + buffer_type(input_tensor.data_ptr()), + buffer_type(dst_slice.data_ptr()), + dst_slice.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + root, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + split_offset += split_size + self.nccl.ncclGroupEnd() + + def reduce_scatter( + self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + op: ReduceOp = ReduceOp.SUM, + stream=None, + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclReduceScatter( + buffer_type(input_tensor.data_ptr()), + buffer_type(output_tensor.data_ptr()), + output_tensor.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + ncclRedOpTypeEnum.from_torch(op), + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def reduce_scatterv( + self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + sizes: list[int], + op: ReduceOp = ReduceOp.SUM, + stream=None, + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + 
f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + + split_offset = 0 + self.nccl.ncclGroupStart() + for root, split_size in enumerate(sizes): + chunk = input_tensor[split_offset : split_offset + split_size, ...] + self.nccl.ncclReduce( + buffer_type(chunk.data_ptr()), + buffer_type(output_tensor.data_ptr()), + chunk.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + ncclRedOpTypeEnum.from_torch(op), + root, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + split_offset += split_size + self.nccl.ncclGroupEnd() + + def send(self, tensor: torch.Tensor, dst: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclSend( + buffer_type(tensor.data_ptr()), + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + dst, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def recv(self, tensor: torch.Tensor, src: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclRecv( + buffer_type(tensor.data_ptr()), + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + src, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def broadcast(self, tensor: torch.Tensor, src: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + if src == self.rank: + sendbuff = buffer_type(tensor.data_ptr()) + # NCCL 
requires the sender also to have a receive buffer + recvbuff = buffer_type(tensor.data_ptr()) + else: + sendbuff = buffer_type() + recvbuff = buffer_type(tensor.data_ptr()) + self.nccl.ncclBroadcast( + sendbuff, + recvbuff, + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + src, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def group_start(self): + self.nccl.ncclGroupStart() + + def group_end(self): + self.nccl.ncclGroupEnd() + + def register_comm_window(self, tensor: torch.Tensor): + return self.nccl.ncclCommWindowRegister( + self.comm, + buffer_type(tensor.data_ptr()), + tensor.numel() * tensor.element_size(), + 1, + ) + + def register_comm_window_raw(self, ptr: int, size: int): + return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1) + + def deregister_comm_window(self, window): + return self.nccl.ncclCommWindowDeregister(self.comm, window) diff --git a/aiter/dist/device_communicators/custom_all_reduce.py b/aiter/dist/device_communicators/custom_all_reduce.py new file mode 100644 index 0000000000000000000000000000000000000000..697983ef4ecd014a1fd036424b9c8df16425356e --- /dev/null +++ b/aiter/dist/device_communicators/custom_all_reduce.py @@ -0,0 +1,393 @@ +""" +* Copyright (C) 2024-2025, The vLLM team. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+""" + +from contextlib import contextmanager +from typing import Any, List, Optional, Union + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +# import vllm.envs as envs +# from vllm import _custom_ops as ops +import aiter as ops +from aiter.dist.parallel_state import in_the_same_node_as +from aiter import logger + +try: + ops.meta_size() + custom_ar = True +except Exception as e: + # For CPUs + custom_ar = False + logger.warning(f"Custom allreduce is disabled: {e}") + + +def is_weak_contiguous(inp: torch.Tensor): + return inp.is_contiguous() or ( + inp.storage().nbytes() - inp.storage_offset() * inp.element_size() + == inp.numel() * inp.element_size() + ) + + +class CustomAllreduce: + + _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8] + + # max_size: max supported allreduce size + def __init__( + self, + group: ProcessGroup, + device: Union[int, str, torch.device], + max_size=8192 * 1024 * 8, + ) -> None: + """ + Args: + group: the process group to work on. If None, it will use the + default process group. + device: the device to bind the CustomAllreduce to. If None, + it will be bind to f"cuda:{local_rank}". + It is the caller's responsibility to make sure each communicator + is bind to a unique device, and all communicators in this group + are in the same node. + """ + self._IS_CAPTURING = False + self.disabled = True + + if not custom_ar: + # disable because of missing custom allreduce library + # e.g. in a non-cuda environment + return + + self.group = group + + assert ( + dist.get_backend(group) != dist.Backend.NCCL + ), "CustomAllreduce should be attached to a non-NCCL group." + + if not all(in_the_same_node_as(group, source_rank=0)): + # No need to initialize custom allreduce for multi-node case. + logger.warning( + "Custom allreduce is disabled because this process group" + " spans across nodes." 
+ ) + return + + rank = dist.get_rank(group=self.group) + world_size = dist.get_world_size(group=self.group) + if world_size == 1: + # No need to initialize custom allreduce for single GPU case. + return + + if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES: + logger.warning( + "Custom allreduce is disabled due to an unsupported world" + " size: %d. Supported world sizes: %s. To silence this " + "warning, specify disable_custom_all_reduce=True explicitly.", + world_size, + str(CustomAllreduce._SUPPORTED_WORLD_SIZES), + ) + return + + if isinstance(device, int): + device = torch.device(f"cuda:{device}") + elif isinstance(device, str): + device = torch.device(device) + # now `device` is a `torch.device` object + assert isinstance(device, torch.device) + self.device = device + + # device_ids = get_cuda_visible_devices() + + # physical_device_id = device_ids[device.index] + # tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu") + # gather_list = [ + # torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size) + # ] + # dist.all_gather(gather_list, tensor, group=self.group) + # physical_device_ids = [t.item() for t in gather_list] + + # test nvlink first, this will filter out most of the cases + # where custom allreduce is not supported + # this checks hardware and driver support for NVLink + # assert current_platform.is_cuda() or current_platform.is_rocm() + # fully_connected = current_platform.is_full_nvlink(physical_device_ids) + fully_connected = True + if world_size > 2 and not fully_connected: + logger.warning( + "Custom allreduce is disabled because it's not supported on" + " more than two PCIe-only GPUs. To silence this warning, " + "specify disable_custom_all_reduce=True explicitly." 
+ ) + return + # test P2P capability, this checks software/cudaruntime support + # this is expensive to compute at the first time + # then we cache the result + # On hygon GPU, p2p is always enabled between XGMI connected GPUs + # if not current_platform.is_rocm() and not _can_p2p(rank, world_size): + # logger.warning( + # "Custom allreduce is disabled because your platform lacks " + # "GPU P2P capability or P2P test failed. To silence this " + # "warning, specify disable_custom_all_reduce=True explicitly.") + # return + + self.disabled = False + # buffers memory are owned by this Python class and passed to C++ + # meta data composes of two parts: meta data for synchronization + # (256 bytes) and a temporary buffer for storing intermediate + # allreduce results. + # if current_platform.is_rocm(): + self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size) + # This is a pre-registered IPC buffer. In eager mode, input tensors + # are first copied into this buffer before allreduce is performed + self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device) + # This is a buffer for storing the tuples of pointers pointing to + # IPC buffers from all ranks. Each registered tuple has size of + # 8*world_size bytes where world_size is at most 8. Allocating 8MB + # is enough for 131072 such tuples. The largest model I've seen only + # needs less than 10000 of registered tuples. 
+ self.rank_data = torch.empty( + 8 * 1024 * 1024, dtype=torch.uint8, device=self.device + ) + self.max_size = max_size + self.rank = rank + self.world_size = world_size + handle = ops.get_meta_buffer_ipc_handle(self.meta) + shard_data = ( + handle, # ipc handle to base ptr + 0, # offset of base ptr + ) + handles, offsets = self._gather_ipc_meta(shard_data) + + self.fully_connected = fully_connected + self._ptr = ops.init_custom_ar( + self.meta, self.rank_data, handles, offsets, rank, self.fully_connected + ) + self.register_buffer(self.buffer) + + @contextmanager + def capture(self): + """ + The main responsibility of this context manager is the + `register_graph_buffers` call at the end of the context. + It records all the buffer addresses used in the CUDA graph. + """ + try: + self._IS_CAPTURING = True + yield + finally: + self._IS_CAPTURING = False + if not self.disabled: + self.register_graph_buffers() + + def _get_ipc_meta(self, inp: torch.Tensor): + # if current_platform.is_rocm(): + if 1: + # _share_cuda_() doesn't accept meta buffer not allocated from + # PyTorch cache allocator, use direct HIP call to get IPC handle + handle = ops.get_meta_buffer_ipc_handle(inp) + shard_data = ( + handle, # ipc handle to base ptr + 0, # offset of base ptr + ) + else: + data = inp.untyped_storage()._share_cuda_() + shard_data = ( + data[1], # ipc handle to base ptr + data[3], # offset of base ptr + ) + return self._gather_ipc_meta(shard_data) + + def _gather_ipc_meta(self, shard_data): + # Note: don't use `[[None]] * self.world_size` here + # because it will create a list of the same reference + all_data: List[Optional[Any]] = [[None] for i in range(self.world_size)] + all_data[self.rank][0] = shard_data + + ranks = dist.get_process_group_ranks(group=self.group) + ranks.sort() + for i, rank in enumerate(ranks): + dist.broadcast_object_list( + all_data[i], src=rank, group=self.group, device="cpu" + ) + + # we cannot directly use `dist.all_gather_object` here + # because it 
is incompatible with `gloo` backend under inference mode. + # see https://github.com/pytorch/pytorch/issues/126032 for details. + + handles = [] + offsets = [] + for i in range(len(all_data)): + handles.append(all_data[i][0][0]) # type: ignore + offsets.append(all_data[i][0][1]) # type: ignore + return handles, offsets + + def register_buffer(self, inp: torch.Tensor): + handles, offsets = self._get_ipc_meta(inp) + ops.register_buffer(self._ptr, inp, handles, offsets) + + def register_graph_buffers(self): + handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) + handles, offsets = self._gather_ipc_meta((handle, offset)) + logger.info("Registering %d cuda graph addresses", len(offset)) + ops.register_graph_buffers(self._ptr, handles, offsets) + + def should_custom_ar(self, inp: torch.Tensor): + if self.disabled: + return False + inp_size = inp.numel() * inp.element_size() + # custom allreduce requires input byte size to be multiples of 16 + if inp_size % 16 != 0: + return False + if not is_weak_contiguous(inp): + return False + # for 4 or more non NVLink-capable GPUs, custom allreduce provides + # little performance improvement over NCCL. + if self.world_size == 2 or self.fully_connected: + return inp_size <= self.max_size + return False + + def all_reduce( + self, + inp: torch.Tensor, + *, + out: Optional[torch.Tensor] = None, + open_fp8_quant: bool = False, + registered: bool = False, + ): + """Performs an out-of-place all reduce. + + If registered is True, this assumes inp's pointer is already + IPC-registered. Otherwise, inp is first copied into a pre-registered + buffer. 
+ """ + if out is None: + out = torch.empty_like(inp) + ops.all_reduce( + self._ptr, + inp, + out, + open_fp8_quant, + None if registered else self.buffer, + ) + return out + + def custom_all_reduce( + self, input: torch.Tensor, open_fp8_quant: bool = False + ) -> Optional[torch.Tensor]: + # when custom allreduce is disabled, this will be None + if self.disabled or not self.should_custom_ar(input): + return None + if self._IS_CAPTURING: + if torch.cuda.is_current_stream_capturing(): + return self.all_reduce( + input, open_fp8_quant=open_fp8_quant, registered=True + ) + else: + # if warm up, mimic the allocation pattern + # since custom allreduce is out-of-place + return torch.zeros_like(input) + else: + # note: outside of cuda graph context, + # custom allreduce incurs a cost of cudaMemcpy, which should + # be small(<=1% of overall latency) compared to the performance + # gains of using custom kernels + return self.all_reduce( + input, open_fp8_quant=open_fp8_quant, registered=False + ) + + def all_gather_reg(self, inp: torch.Tensor, out: torch.Tensor = None): + if out is None: + out = torch.empty( + inp.numel() * self.world_size, dtype=inp.dtype, device=inp.device + ) + ops.all_gather_reg(self._ptr, inp, out) + return out + + def all_gather_unreg(self, inp: torch.Tensor, out: torch.Tensor = None): + if out is None: + out = torch.empty( + inp.numel() * self.world_size, dtype=inp.dtype, device=inp.device + ) + ops.all_gather_unreg(self._ptr, inp, self.buffer, out) + return out + + def custom_all_gather(self, inp: torch.Tensor) -> Optional[torch.Tensor]: + if self._IS_CAPTURING: + if torch.cuda.is_current_stream_capturing(): + return self.all_gather_reg(inp) + else: + print("allgather capture hipgraph error") + return torch.zeros_like(inp) + else: + return self.all_gather_unreg(inp) + + def fused_ar_rms( + self, + inp: torch.Tensor, + res_inp: torch.Tensor, + *, + res_out: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + w: torch.Tensor, + eps: 
def custom_fused_ar_rms(
    self,
    input: torch.Tensor,
    residual_inp: torch.Tensor,
    weight: torch.Tensor,
    eps: float,
) -> Optional[tuple[torch.Tensor, torch.Tensor]]:
    """Run the fused all-reduce + RMSNorm if this communicator can handle it.

    Args:
        input: tensor to reduce.
        residual_inp: residual tensor forwarded to ``fused_ar_rms``.
        weight: forwarded to ``fused_ar_rms`` as ``w``.
        eps: forwarded to ``fused_ar_rms`` as ``eps``.

    Returns:
        ``None`` when the communicator is disabled or ``should_custom_ar``
        rejects ``input``; otherwise the ``(res_out, out)`` pair produced by
        ``fused_ar_rms`` (two zero tensors shaped like ``input`` during
        graph-capture warm-up).
    """
    # when custom allreduce is disabled, this will be None
    if self.disabled or not self.should_custom_ar(input):
        return None
    if not self._IS_CAPTURING:
        return self.fused_ar_rms(
            input, residual_inp, w=weight, eps=eps, registered=False
        )
    if torch.cuda.is_current_stream_capturing():
        return self.fused_ar_rms(
            input, residual_inp, w=weight, eps=eps, registered=True
        )
    # warm-up inside a capture context: only mimic the two allocations
    return torch.zeros_like(input), torch.zeros_like(input)


def close(self):
    """Release the native handle once; safe on partially built instances."""
    # ``__del__`` can run on an object whose ``__init__`` bailed out early,
    # so neither ``disabled`` nor ``_ptr`` may exist yet.
    if getattr(self, "disabled", True):
        return
    ptr = getattr(self, "_ptr", 0)
    # ``ops`` may already be torn down to None at interpreter shutdown.
    if ptr and ops is not None:
        ops.dispose(ptr)
        self._ptr = 0


def __del__(self):
    self.close()
For further details, please check +# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ . +# +# Another rejected idea is to write a C/C++ binding for NCCL. It is usually +# doable, but we often encounter issues related with nccl versions, and need +# to switch between different versions of NCCL. +# A C/C++ binding is not flexible enough to handle this. It requires +# recompilation of the code every time we want to switch between different +# versions. This current implementation, with a **pure** Python wrapper, is +# more flexible. We can easily switch between different versions of NCCL by +# changing the environment variable `VLLM_NCCL_SO_PATH`, or the `so_file` +# variable in the code. + +import ctypes +import platform +from dataclasses import dataclass +from typing import Any + +import torch +from torch.distributed import ReduceOp +from aiter import logger + + +# === export types and functions from nccl to Python === +# for the original nccl definition, please check + +ncclResult_t = ctypes.c_int +ncclComm_t = ctypes.c_void_p +ncclWindow_t = ctypes.c_void_p + + +class ncclUniqueId(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +cudaStream_t = ctypes.c_void_p +buffer_type = ctypes.c_void_p + +ncclDataType_t = ctypes.c_int + + +class ncclDataTypeEnum: + ncclInt8 = 0 + ncclChar = 0 + ncclUint8 = 1 + ncclInt32 = 2 + ncclInt = 2 + ncclUint32 = 3 + ncclInt64 = 4 + ncclUint64 = 5 + ncclFloat16 = 6 + ncclHalf = 6 + ncclFloat32 = 7 + ncclFloat = 7 + ncclFloat64 = 8 + ncclDouble = 8 + ncclBfloat16 = 9 + ncclNumTypes = 10 + + @classmethod + def from_torch(cls, dtype: torch.dtype) -> int: + if dtype == torch.int8: + return cls.ncclInt8 + if dtype == torch.uint8: + return cls.ncclUint8 + if dtype == torch.int32: + return cls.ncclInt32 + if dtype == torch.int64: + return cls.ncclInt64 + if dtype == torch.float16: + return cls.ncclFloat16 + if dtype == torch.float32: + return cls.ncclFloat32 + if dtype == torch.float64: + 
return cls.ncclFloat64 + if dtype == torch.bfloat16: + return cls.ncclBfloat16 + raise ValueError(f"Unsupported dtype: {dtype}") + + +ncclRedOp_t = ctypes.c_int + + +class ncclRedOpTypeEnum: + ncclSum = 0 + ncclProd = 1 + ncclMax = 2 + ncclMin = 3 + ncclAvg = 4 + ncclNumOps = 5 + + @classmethod + def from_torch(cls, op: ReduceOp) -> int: + if op == ReduceOp.SUM: + return cls.ncclSum + if op == ReduceOp.PRODUCT: + return cls.ncclProd + if op == ReduceOp.MAX: + return cls.ncclMax + if op == ReduceOp.MIN: + return cls.ncclMin + if op == ReduceOp.AVG: + return cls.ncclAvg + raise ValueError(f"Unsupported op: {op}") + + +@dataclass +class Function: + name: str + restype: Any + argtypes: list[Any] + + +class NCCLLibrary: + exported_functions = [ + # const char* ncclGetErrorString(ncclResult_t result) + Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]), + # ncclResult_t ncclGetVersion(int *version); + Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]), + # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); + Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]), + # ncclResult_t ncclCommInitRank( + # ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); + # note that ncclComm_t is a pointer type, so the first argument + # is a pointer to a pointer + Function( + "ncclCommInitRank", + ncclResult_t, + [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int], + ), + # ncclResult_t ncclAllReduce( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclAllReduce", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclRedOp_t, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclReduce( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t 
datatype, ncclRedOp_t op, int root, + # ncclComm_t comm, cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclReduce", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclRedOp_t, + ctypes.c_int, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclAllGather( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclAllGather", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclReduceScatter( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclReduceScatter", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclRedOp_t, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclSend( + # const void* sendbuff, size_t count, ncclDataType_t datatype, + # int dest, ncclComm_t comm, cudaStream_t stream); + Function( + "ncclSend", + ncclResult_t, + [ + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ctypes.c_int, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclRecv( + # void* recvbuff, size_t count, ncclDataType_t datatype, + # int src, ncclComm_t comm, cudaStream_t stream); + Function( + "ncclRecv", + ncclResult_t, + [ + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ctypes.c_int, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclBroadcast( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, int root, ncclComm_t comm, + # cudaStream_t stream); + Function( + "ncclBroadcast", + ncclResult_t, 
+ [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ctypes.c_int, + ncclComm_t, + cudaStream_t, + ], + ), + # be cautious! this is a collective call, it will block until all + # processes in the communicator have called this function. + # because Python object destruction can happen in random order, + # it is better not to call it at all. + # ncclResult_t ncclCommDestroy(ncclComm_t comm); + Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]), + # ncclResult_t ncclGroupStart(); + Function("ncclGroupStart", ncclResult_t, []), + # ncclResult_t ncclGroupEnd(); + Function("ncclGroupEnd", ncclResult_t, []), + # ncclResult_t ncclCommWindowRegister( + # ncclComm_t comm, void* buff, size_t size, + # ncclWindow_t* win, int winFlags); + # Function( + # "ncclCommWindowRegister", + # ncclResult_t, + # [ + # ncclComm_t, + # buffer_type, + # ctypes.c_size_t, + # ctypes.POINTER(ncclWindow_t), + # ctypes.c_int, + # ], + # ), + # # ncclResult_t ncclCommWindowDeregister( + # # ncclComm_t comm, ncclWindow_t win); + # Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]), + ] + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the corresponding dictionary + path_to_dict_mapping: dict[str, dict[str, Any]] = {} + + def __init__(self, so_file: str | None = None): + so_file = so_file or "librccl.so.1" + + try: + if so_file not in NCCLLibrary.path_to_dict_mapping: + lib = ctypes.CDLL(so_file) + NCCLLibrary.path_to_library_cache[so_file] = lib + self.lib = NCCLLibrary.path_to_library_cache[so_file] + except Exception as e: + logger.error( + "Failed to load NCCL library from %s. " + "It is expected if you are not running on Hygon GPUs." + "Otherwise, the nccl library might not exist, be corrupted " + "or it does not support the current platform %s. 
" + "If you already have the library, please set the " + "environment variable VLLM_NCCL_SO_PATH" + " to point to the correct nccl library path.", + so_file, + platform.platform(), + ) + raise e + + if so_file not in NCCLLibrary.path_to_dict_mapping: + _funcs: dict[str, Any] = {} + for func in NCCLLibrary.exported_functions: + try: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + except AttributeError: + if func.name in [ + "ncclCommWindowRegister", + "ncclCommWindowDeregister", + ]: + logger.warning( + "The symbol %s is not found in the NCCL " + "library %s. To enable VLLM_USE_NCCL_SYMM_MEM " + " please update your NCCL version to >= " + "2.27.03.", + func.name, + so_file, + ) + # Having an exception here on ROCm platform is + # not allowed during graph capturing + continue + raise + NCCLLibrary.path_to_dict_mapping[so_file] = _funcs + self._funcs = NCCLLibrary.path_to_dict_mapping[so_file] + + def ncclGetErrorString(self, result: ncclResult_t) -> str: + return self._funcs["ncclGetErrorString"](result).decode("utf-8") + + def NCCL_CHECK(self, result: ncclResult_t) -> None: + if result != 0: + error_str = self.ncclGetErrorString(result) + raise RuntimeError(f"NCCL error: {error_str}") + + def ncclGetRawVersion(self) -> int: + version = ctypes.c_int() + self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version))) + # something like 21903 + return version.value + + def ncclGetVersion(self) -> str: + version_str = str(self.ncclGetRawVersion()) + # something like 21903 --> "2.19.3" + major = version_str[0].lstrip("0") + minor = version_str[1:3].lstrip("0") + patch = version_str[3:].lstrip("0") + return f"{major}.{minor}.{patch}" + + def ncclGetUniqueId(self) -> ncclUniqueId: + unique_id = ncclUniqueId() + self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id))) + return unique_id + + def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: + if len(data) != 128: + raise 
ValueError( + f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes" + ) + unique_id = ncclUniqueId() + ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128) + return unique_id + + def ncclCommInitRank( + self, world_size: int, unique_id: ncclUniqueId, rank: int + ) -> ncclComm_t: + comm = ncclComm_t() + self.NCCL_CHECK( + self._funcs["ncclCommInitRank"]( + ctypes.byref(comm), world_size, unique_id, rank + ) + ) + return comm + + def ncclAllReduce( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + op: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # and `op` should be `ncclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK( + self._funcs["ncclAllReduce"]( + sendbuff, recvbuff, count, datatype, op, comm, stream + ) + ) + + def ncclReduce( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + op: int, + root: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # and `op` should be `ncclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK( + self._funcs["ncclReduce"]( + sendbuff, recvbuff, count, datatype, op, root, comm, stream + ) + ) + + def ncclReduceScatter( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + op: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # and `op` should be `ncclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK( + self._funcs["ncclReduceScatter"]( + sendbuff, 
recvbuff, count, datatype, op, comm, stream + ) + ) + + def ncclAllGather( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # which is an aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK( + self._funcs["ncclAllGather"]( + sendbuff, recvbuff, count, datatype, comm, stream + ) + ) + + def ncclSend( + self, + sendbuff: buffer_type, + count: int, + datatype: int, + dest: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + self.NCCL_CHECK( + self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream) + ) + + def ncclRecv( + self, + recvbuff: buffer_type, + count: int, + datatype: int, + src: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + self.NCCL_CHECK( + self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream) + ) + + def ncclBroadcast( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + root: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + self.NCCL_CHECK( + self._funcs["ncclBroadcast"]( + sendbuff, recvbuff, count, datatype, root, comm, stream + ) + ) + + def ncclCommDestroy(self, comm: ncclComm_t) -> None: + self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm)) + + def ncclGroupStart(self) -> None: + self.NCCL_CHECK(self._funcs["ncclGroupStart"]()) + + def ncclGroupEnd(self) -> None: + self.NCCL_CHECK(self._funcs["ncclGroupEnd"]()) + + def ncclCommWindowRegister( + self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int + ) -> ncclWindow_t: + window = ncclWindow_t() + self.NCCL_CHECK( + self._funcs["ncclCommWindowRegister"]( + comm, buff, size, ctypes.byref(window), win_flags + ) + ) + return window + + def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None: + 
logger = logging.getLogger(__name__)


class QuickReduceRegime(Enum):
    """Quantization regimes; the value is passed to the kernel as-is."""

    FP = 0
    FP8 = 1
    INT6 = 2
    INT4 = 3
    NONE = 4


# Probe once at import time whether the quick-reduce binding is usable.
# It is only attempted when AITER_QUICK_REDUCE_QUANTIZATION names a regime.
try:
    quick_ar = False
    regime_str = os.environ.get("AITER_QUICK_REDUCE_QUANTIZATION", None)
    if regime_str in QuickReduceRegime.__members__:
        ops.qr_max_size()
        quick_ar = True
except Exception:
    # For CPUs and CUDA
    quick_ar = False


def qr_rocm_arch_available():
    """Return True if device 0 reports a gfx94*/gfx95* ``gcnArchName``."""
    try:
        props = torch.cuda.get_device_properties(0)
        gcn_arch = getattr(props, "gcnArchName", "")
        supported_archs = ["gfx94", "gfx95"]
        return any(gfx in gcn_arch for gfx in supported_archs)
    except Exception as e:
        logger.warning("Failed to determine ROCm for quick allreduce: %s", e)
        return False


def is_weak_contiguous(inp: torch.Tensor):
    """Return True if ``inp`` is contiguous or exactly fills its storage tail."""
    if inp.is_contiguous():
        return True
    # untyped_storage() reports the same byte size as the deprecated
    # TypedStorage accessor without emitting a deprecation warning.
    tail_bytes = (
        inp.untyped_storage().nbytes() - inp.storage_offset() * inp.element_size()
    )
    return tail_bytes == inp.numel() * inp.element_size()


MB = 1024 * 1024


class QuickAllReduce:
    """Quantized all-reduce communicator backed by the ``qr_*`` ops."""

    _SUPPORTED_WORLD_SIZES = [2, 4, 8]
    _SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
    # The following data is based on kernel tests.
    # In this order [FP, FP8, INT6, INT4].
    _QR_MIN_SIZE = {
        (torch.float16, 2): [1 * MB, 2 * MB, 2 * MB, 1 * MB],
        (torch.float16, 4): [1 * MB, 16 * MB, 4 * MB, 2 * MB],
        (torch.float16, 8): [16 * MB, 4 * MB, 4 * MB, 2 * MB],
        (torch.bfloat16, 2): [2 * MB, 8 * MB, 8 * MB, 8 * MB],
        (torch.bfloat16, 4): [8 * MB, 64 * MB, 64 * MB, 16 * MB],
        (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
    }

    def __init__(
        self, group: ProcessGroup, device: Union[int, str, torch.device]
    ) -> None:
        """
        Quick allreduce leverages quantization for further
        acceleration on ROCm. It currently supports FP8, Q6, and Q4
        quantization formats and FP(float16, bfloat16).
        Quick allreduce is designed as a complement to custom allreduce.
        Its initialization requires even stricter conditions; whenever one
        of them is not met the instance is left with ``disabled = True``.
        Args:
            group: the process group to work on; it must not use the NCCL
                backend.
            device: the device to bind the communicator to (an index, a
                device string or a ``torch.device``).
        It is the caller's responsibility to make sure each communicator
        is bound to a unique device, and all communicators in this group
        are in the same node.
        """
        self.disabled = True
        if not qr_rocm_arch_available():
            logger.debug(
                "Custom quick allreduce is limited supported."
            )
            return

        if not quick_ar:
            return

        self.group = group
        assert (
            dist.get_backend(group) != dist.Backend.NCCL
        ), "Custom quick allreduce should be attached to a non-NCCL group."
        if not all(in_the_same_node_as(group, source_rank=0)):
            # No need to initialize custom quick allreduce for
            # multi-node case.
            logger.warning(
                "Custom quick allreduce is disabled because this "
                "process group spans across nodes."
            )
            return
        rank = dist.get_rank(group=self.group)
        world_size = dist.get_world_size(group=self.group)
        self.rank = rank
        self.world_size = world_size
        if world_size == 1:
            # No need to initialize QuickReduce for single GPU case.
            return

        if world_size not in QuickAllReduce._SUPPORTED_WORLD_SIZES:
            logger.warning(
                "Custom quick allreduce is disabled due to an "
                "unsupported world size: %d. Supported world sizes: %s.",
                world_size,
                str(QuickAllReduce._SUPPORTED_WORLD_SIZES),
            )
            return

        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
        assert isinstance(device, torch.device)
        self.device = device

        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
        if cuda_visible_devices:
            device_ids = list(map(int, cuda_visible_devices.split(",")))
        else:
            device_ids = list(range(torch.cuda.device_count()))
        physical_device_id = device_ids[device.index]
        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
        gather_list = [
            torch.tensor([0], dtype=torch.int, device="cpu")
            for _ in range(self.world_size)
        ]
        # Collective call: every rank of the group must reach it.
        dist.all_gather(gather_list, tensor, group=self.group)
        # Only consumed by the topology check that is disabled below.
        physical_device_ids = [t.item() for t in gather_list]

        # test nvlink first, this will filter out most of the cases
        # where custom quick allreduce is not supported
        # this checks hardware and driver support for NVLink

        # self.fully_connected = is_full_nvlink(physical_device_ids, self.world_size)
        self.fully_connected = True
        if self.world_size > 2 and not self.fully_connected:
            logger.debug(
                "Custom quick allreduce is disabled because it's not supported "
                "on more than two PCIe-only GPUs. "
            )
            return

        self.init_quick_all_reduce()

    def init_quick_all_reduce(self):
        """Read the env configuration and create the native communicator.

        Leaves the instance disabled when the quantization level is invalid
        or ``NONE``; otherwise sets ``_ptr``, ``qr_max_size``,
        ``qr_quant_level`` and clears ``disabled``.
        """
        # On RocM, bfloat16 kernels are slower than fp16
        # due to slower match operations
        # If environment variable is set to 1, we convert input to fp16
        self.use_fp16_kernels = int(
            os.environ.get("AITER_QUICK_REDUCE_CAST_BF16_TO_FP16", 1)
        )
        regime_str = os.environ.get("AITER_QUICK_REDUCE_QUANTIZATION", "NONE")
        if regime_str not in QuickReduceRegime.__members__:
            # Use %-style placeholders: passing extra positional arguments
            # without placeholders makes logging fail to format the record.
            logger.warning(
                "Custom quick allreduce: invalid quantization level: %s. "
                "Supported levels: %s",
                regime_str,
                list(QuickReduceRegime.__members__.keys()),
            )
            return

        if regime_str == "NONE":
            logger.debug(
                "Custom quick allreduce is disabled based "
                "on env variable "
                "AITER_QUICK_REDUCE_QUANTIZATION='NONE'"
            )
            return
        self.qr_quant_level = QuickReduceRegime[regime_str]

        # TODO: If the dtype is not bfloat16 or then float16,
        # quickallreduce should not be created.

        # AITER_QUICK_REDUCE_MAX_SIZE_BYTES_MB is specified in MB
        qr_max_size = int(os.environ.get("AITER_QUICK_REDUCE_MAX_SIZE_BYTES_MB", 0))
        if qr_max_size < 0:
            logger.warning(
                "Ignoring negative AITER_QUICK_REDUCE_MAX_SIZE_BYTES_MB=%d; "
                "using the default maximum size.",
                qr_max_size,
            )
            qr_max_size = 0
        qr_max_size *= MB
        # If qr_max_size is 0, the library default (ops.qr_max_size()) is used.
        self._ptr = ops.init_custom_qr(self.rank, self.world_size, qr_max_size)
        self.qr_max_size = qr_max_size if qr_max_size > 0 else ops.qr_max_size()
        self.create_shared_buffer()
        self.disabled = False

    def create_shared_buffer(self):
        """
        Creates a shared buffer for quickreduce.
        Has to be called after init_custom_qr
        """
        handle = ops.qr_get_handle(self._ptr)
        world_size = dist.get_world_size(group=self.group)
        handles = [None] * world_size
        # Exchange every rank's handle, then open all of them locally.
        dist.all_gather_object(handles, handle, group=self.group)
        ops.qr_open_handles(self._ptr, handles)

    def should_quick_allreduce(self, inp: torch.Tensor):
        """
        Check if quickreduce is available for ``inp``: communicator enabled,
        supported dtype, byte size a multiple of 16, weakly contiguous and
        within the [min, max] size window of the active regime.
        """
        if self.disabled:
            return False
        if inp.dtype not in self._SUPPORTED_DTYPES:
            return False
        inp_size = inp.numel() * inp.element_size()
        # custom quick allreduce requires input byte size to be
        # multiples of 16
        if inp_size % 16 != 0:
            return False
        if not is_weak_contiguous(inp):
            return False
        # When inputs are cast to fp16 the fp16 thresholds apply.
        dtype = torch.float16 if self.use_fp16_kernels else inp.dtype
        min_size = self._QR_MIN_SIZE[(dtype, self.world_size)][
            self.qr_quant_level.value
        ]
        return min_size <= inp_size <= self.qr_max_size

    def quick_all_reduce(self, inp: torch.Tensor, *, out: torch.Tensor = None):
        """Performs an out-of-place custom quick all reduce."""
        # quick allreduce doesn't require a separate graph mode,
        # as QR uses static IPC buffer.
        if out is None:
            out = torch.empty_like(inp)
        ops.qr_all_reduce(
            self._ptr, inp, out, self.qr_quant_level.value, self.use_fp16_kernels
        )
        return out

    def close(self):
        """Destroy the native communicator once and mark the instance disabled."""
        if not self.disabled and getattr(self, "_ptr", None):
            # `ops` may already be None during interpreter shutdown.
            if ops is not None:
                ops.qr_destroy(self._ptr)
            self._ptr = 0
            self.disabled = True

    def __del__(self):
        self.close()
def supports_custom_op():
    """Return whether custom-op registration is available (always True here)."""
    return True


@dataclass
class GraphCaptureContext:
    """Carries the CUDA stream used while capturing a graph."""

    stream: torch.cuda.Stream


TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])


def _split_tensor_dict(
    tensor_dict: Dict[str, Union[torch.Tensor, Any]],
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
    """Split the tensor dictionary into two parts:
    1. A list of (key, value) pairs. If the value is a tensor, it is replaced
         by its metadata.
    2. A list of tensors.
    """
    metadata_list: List[Tuple[str, Any]] = []
    tensor_list: List[torch.Tensor] = []
    for key, value in tensor_dict.items():
        if not isinstance(value, torch.Tensor):
            metadata_list.append((key, value))
            continue
        # Note: we cannot use `value.device` here,
        # because it contains not only the device type but also the device
        # index (e.g. "cuda:0"). We only need the device type.
        # receiving side will set the device index.
        device = value.device.type
        metadata_list.append((key, TensorMetadata(device, value.dtype, value.size())))
        tensor_list.append(value)
    return metadata_list, tensor_list


_group_name_counter: Dict[str, int] = {}


def _get_unique_name(name: str) -> str:
    """Get a unique name for the group.
    Example:
    _get_unique_name("tp") -> "tp:0"
    _get_unique_name("tp") -> "tp:1"
    """
    index = _group_name_counter.get(name, 0)
    _group_name_counter[name] = index + 1
    return f"{name}:{index}"


_groups: Dict[str, Callable[[], "GroupCoordinator"]] = {}


def _register_group(group: "GroupCoordinator") -> None:
    """Record a weak reference to ``group`` under its unique name."""
    # looks like Python 3.8 does not understand `ReferenceType`
    _groups[group.unique_name] = weakref.ref(group)  # type: ignore


def all_reduce_fake(
    tensor: torch.Tensor, group_name: str, ca_fp8_quant: bool
) -> torch.Tensor:
    """Fake implementation: an uninitialised tensor shaped like the input."""
    return torch.empty_like(tensor)


# There is same name all_reduce in aiter.op, use Alias
@torch_compile_guard(gen_fake=all_reduce_fake)
def all_reduce_(
    tensor: torch.Tensor, group_name: str, ca_fp8_quant: bool
) -> torch.Tensor:
    """Out-of-place all-reduce on the registered group ``group_name``.

    Raises:
        AssertionError: if no group was registered under ``group_name``.
        ValueError: if the group has already been garbage collected.
    """
    assert group_name in _groups, f"Group {group_name} is not found."
    group = _groups[group_name]()
    if group is None:
        raise ValueError(f"Group {group_name} is destroyed.")
    return group._all_reduce_out_place(tensor, ca_fp8_quant)


def fused_allreduce_rmsnorm_fake(
    inp: torch.Tensor,
    res_inp: torch.Tensor,
    w: torch.Tensor,
    eps: float,
    group_name: str,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Fake implementation of ``fused_allreduce_rmsnorm_``.

    The real op is annotated as returning a pair of tensors, so the fake
    returns a pair as well; both are uninitialised and shaped like ``inp``.
    """
    return torch.empty_like(inp), torch.empty_like(inp)


@torch_compile_guard(gen_fake=fused_allreduce_rmsnorm_fake)
def fused_allreduce_rmsnorm_(
    inp: torch.Tensor,
    res_inp: torch.Tensor,
    w: torch.Tensor,
    eps: float,
    group_name: str,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Fused all-reduce + RMSNorm on the registered group ``group_name``.

    Raises:
        AssertionError: if no group was registered under ``group_name``.
        ValueError: if the group has already been garbage collected.
    """
    assert group_name in _groups, f"Group {group_name} is not found."
    group = _groups[group_name]()
    if group is None:
        raise ValueError(f"Group {group_name} is destroyed.")
    return group._fused_allreduce_rmsnorm_out_place(inp, res_inp, w, eps)


if supports_custom_op():

    # @torch.library.custom_op("aiter::outplace_all_gather", mutates_args=[])
    def outplace_all_gather(input: torch.Tensor, group_name: str) -> torch.Tensor:
        """Out-of-place all-gather on the registered group ``group_name``."""
        assert group_name in _groups, f"Group {group_name} is not found."
        group = _groups[group_name]()
        if group is None:
            raise ValueError(f"Group {group_name} is destroyed.")
        return group._all_gather_out_place(input)
List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + use_device_communicator: bool, # whether to use device communicator + use_message_queue_broadcaster: bool = False, + group_name: Optional[str] = None, + ): + group_name = group_name or "anonymous" + self.unique_name = _get_unique_name(group_name) + _register_group(self) + + self.rank = torch.distributed.get_rank() + self.local_rank = local_rank + + self_device_group = None + self_cpu_group = None + + for ranks in group_ranks: + device_group = torch.distributed.new_group( + ranks, backend=torch_distributed_backend + ) + # a group with `gloo` backend, to allow direct coordination between + # processes through the CPU. + cpu_group = torch.distributed.new_group(ranks, backend="gloo") + if self.rank in ranks: + self.ranks = ranks + self.world_size = len(ranks) + self.rank_in_group = ranks.index(self.rank) + self_device_group = device_group + self_cpu_group = cpu_group + + assert self_cpu_group is not None + assert self_device_group is not None + + self.cpu_group = self_cpu_group + self.device_group = self_device_group + + self.device = torch.device(f"cuda:{local_rank}") + + self.use_device_communicator = use_device_communicator + logger.debug( + f"Initialized GroupCoordinator {self.unique_name} with " + f"ranks={self.ranks}, local_rank={self.local_rank}, " + f"world_size={self.world_size}, " + f"torch_distributed_backend={torch_distributed_backend}, " + f"use_device_communicator={self.use_device_communicator}" + ) + self.device_communicator = None + if use_device_communicator and self.world_size > 1: + from .device_communicators.communicator_cuda import CudaCommunicator + + self.device_communicator = CudaCommunicator( + cpu_group=self.cpu_group, + device=self.device, + device_group=self.device_group, + unique_name=self.unique_name, + ) + + from .shm_broadcast import MessageQueue + + self.mq_broadcaster = None + if use_message_queue_broadcaster and self.world_size > 1: + 
self.mq_broadcaster = MessageQueue.create_from_process_group( + self.cpu_group, 1 << 22, 6 + ) + + @property + def first_rank(self): + """Return the global rank of the first process in the group""" + return self.ranks[0] + + @property + def last_rank(self): + """Return the global rank of the last process in the group""" + return self.ranks[-1] + + @property + def is_first_rank(self): + """Return whether the caller is the first process in the group""" + return self.rank == self.first_rank + + @property + def is_last_rank(self): + """Return whether the caller is the last process in the group""" + return self.rank == self.last_rank + + @property + def next_rank(self): + """Return the global rank of the process that follows the caller""" + rank_in_group = self.rank_in_group + world_size = self.world_size + return self.ranks[(rank_in_group + 1) % world_size] + + @property + def prev_rank(self): + """Return the global rank of the process that precedes the caller""" + rank_in_group = self.rank_in_group + world_size = self.world_size + return self.ranks[(rank_in_group - 1) % world_size] + + @contextmanager + def graph_capture( + self, graph_capture_context: Optional[GraphCaptureContext] = None + ): + if graph_capture_context is None: + stream = torch.cuda.Stream() + graph_capture_context = GraphCaptureContext(stream) + else: + stream = graph_capture_context.stream + + # only cuda uses this function, + # so we don't abstract it into the base class + maybe_ca_context = nullcontext() + from aiter.dist.device_communicators.communicator_cuda import ( + CudaCommunicator, + ) + + if self.device_communicator is not None: + assert isinstance(self.device_communicator, CudaCommunicator) + ca_comm = self.device_communicator.ca_comm + if ca_comm is not None: + maybe_ca_context = ca_comm.capture() # type: ignore + + # ensure all initialization operations complete before attempting to + # capture the graph on another stream + curr_stream = torch.cuda.current_stream() + if curr_stream != 
def _all_gather_out_place(self, input_: torch.Tensor) -> torch.Tensor:
    """Run the custom all-gather of the device communicator on ``input_``.

    Returns:
        The tensor produced by ``ca_comm.custom_all_gather``.

    Raises:
        ValueError: If this group has no device communicator (same error as
            the other ``_..._out_place`` helpers of this class).
        AssertionError: If the custom all-reduce communicator is missing or
            disabled, or if it returned ``None``.
    """
    # Without this guard a communicator-less group fails with an opaque
    # "'NoneType' object has no attribute 'ca_comm'".
    if self.device_communicator is None:
        raise ValueError("No device communicator found")
    ca_comm = self.device_communicator.ca_comm
    assert ca_comm is not None
    assert not ca_comm.disabled
    out = ca_comm.custom_all_gather(input_)
    assert out is not None
    return out
def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
    """Broadcast one Python object from ``src`` to every rank of the group.

    ``src`` is a rank *inside the group*; it is translated to a global rank
    through ``self.ranks`` before the collective is issued.

    Returns:
        ``obj`` itself on the source rank (and on single-member groups),
        the received object on every other rank.
    """
    assert src < self.world_size, f"Invalid src rank ({src})"

    # Nothing to exchange in a single-member group.
    if self.world_size == 1:
        return obj

    queue = self.mq_broadcaster
    if queue is not None:
        assert src == 0, "Message queue broadcaster only supports src=0"
        return queue.broadcast_object(obj)

    # The sender ships its object, receivers provide an empty slot to fill.
    is_source = self.rank_in_group == src
    slot = [obj] if is_source else [None]
    torch.distributed.broadcast_object_list(
        slot, src=self.ranks[src], group=self.cpu_group
    )
    return obj if is_source else slot[0]
def recv_object(self, src: int) -> Any:
    """Receive one pickled Python object from ``src`` over the CPU group.

    ``src`` is a rank *inside the group*. Two messages are received: a
    one-element ``long`` tensor holding the payload length, then the
    ``uint8`` payload itself.

    NOTE: the payload is passed to ``pickle.loads``, so the sending rank
    must be trusted.

    Raises:
        AssertionError: If ``src`` is out of range, equals the caller's own
            rank in the group, or the two messages report different senders.
    """
    assert src < self.world_size, f"Invalid src rank ({src})"
    assert (
        src != self.rank_in_group
    ), "Invalid source rank. Source rank is the same as the current rank."

    # Message 1: the payload length.
    length_header = torch.empty(1, dtype=torch.long, device="cpu")
    header_sender = torch.distributed.recv(
        length_header, src=self.ranks[src], group=self.cpu_group
    )

    # Message 2: the serialized object, into a buffer of exactly that length.
    payload = torch.empty(  # type: ignore[call-overload]
        length_header.item(),  # type: ignore[arg-type]
        dtype=torch.uint8,
        device="cpu",
    )
    payload_sender = torch.distributed.recv(
        payload, src=self.ranks[src], group=self.cpu_group
    )

    assert (
        payload_sender == header_sender
    ), "Received object sender rank does not match the size sender rank."

    return pickle.loads(payload.numpy().tobytes())
def send_tensor_dict(
    self,
    tensor_dict: Dict[str, Union[torch.Tensor, Any]],
    dst: Optional[int] = None,
    all_gather_group: Optional["GroupCoordinator"] = None,
) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
    """Send a dictionary of tensors and plain values to another rank.

    ``dst`` is the rank of the destination *inside the group*; when omitted
    the next rank in the group (wrapping around) is used.

    The metadata is sent first through ``send_object``; every non-empty
    tensor then follows, CPU tensors over the CPU group and all others over
    the device group. When ``all_gather_group`` is given and a tensor's
    element count is divisible by that group's size, only this rank's slice
    of the tensor is sent.

    Returns:
        ``tensor_dict`` unchanged when torch.distributed is not initialized
        or the group has a single member; ``None`` otherwise.
    """
    # Nothing to send without a peer.
    if not torch.distributed.is_initialized() or self.world_size == 1:
        return tensor_dict

    if all_gather_group is None:
        shard_count, shard_index = 1, 0
    else:
        shard_count = all_gather_group.world_size
        shard_index = all_gather_group.rank_in_group

    device_pg = self.device_group
    cpu_pg = self.cpu_group

    if dst is None:
        dst = (self.rank_in_group + 1) % self.world_size
    assert dst < self.world_size, f"Invalid dst rank ({dst})"

    assert isinstance(
        tensor_dict, dict
    ), f"Expecting a dictionary, got {type(tensor_dict)}"
    metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
    # The metadata goes through `send_object`, which uses the CPU group.
    self.send_object(metadata_list, dst=dst)

    for tensor in tensor_list:
        # Empty tensors are not transmitted.
        if tensor.numel() == 0:
            continue

        # send-allgather: send only a slice, then do allgather.
        if all_gather_group is not None and tensor.numel() % shard_count == 0:
            tensor = tensor.reshape(shard_count, -1)[shard_index]

        channel = cpu_pg if tensor.is_cpu else device_pg
        torch.distributed.send(tensor, dst=self.ranks[dst], group=channel)
    return None
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return None + + all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size + all_gather_rank = ( + 0 if all_gather_group is None else all_gather_group.rank_in_group + ) + + group = self.device_group + metadata_group = self.cpu_group + + if src is None: + src = (self.rank_in_group - 1) % self.world_size + assert src < self.world_size, f"Invalid src rank ({src})" + + recv_metadata_list = self.recv_object(src=src) + tensor_dict: Dict[str, Any] = {} + for key, value in recv_metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, dtype=value.dtype, device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + tensor_dict[key] = tensor + continue + + # send-allgather: send only a slice, then do allgather. + use_all_gather = ( + all_gather_group is not None + and tensor.numel() % all_gather_size == 0 + ) + + if use_all_gather: + orig_shape = tensor.shape + tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank] + + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.recv( + tensor, src=self.ranks[src], group=metadata_group + ) + else: + # use group for GPU tensors + torch.distributed.recv(tensor, src=self.ranks[src], group=group) + if use_all_gather: + # do the allgather + tensor = all_gather_group.all_gather(tensor, dim=0) # type: ignore + tensor = tensor.reshape(orig_shape) + + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + return tensor_dict + + def barrier(self): + """Barrier synchronization among the group. + NOTE: don't use `device_group` here! `barrier` in NCCL is + terrible because it is internally a broadcast operation with + secretly created GPU tensors. It is easy to mess up the current + device. Use the CPU group instead. 
def destroy(self):
    """Release the process groups and communicators owned by this group.

    Safe to call more than once: each resource is dropped from the instance
    after it has been torn down, so a repeated call finds nothing to do.
    """
    if hasattr(self, "device_group"):
        torch.distributed.destroy_process_group(self.device_group)
        del self.device_group
    if hasattr(self, "cpu_group"):
        torch.distributed.destroy_process_group(self.cpu_group)
        del self.cpu_group
    if self.device_communicator is not None:
        self.device_communicator.destroy()
        # Forget the communicator so a second destroy() does not tear it
        # down again, matching the guarded process-group cleanup above.
        self.device_communicator = None
    if self.mq_broadcaster is not None:
        self.mq_broadcaster = None
def _require_group(group, description: str):
    """Return ``group``, or raise ``AssertionError`` if it is still ``None``.

    The error is raised explicitly rather than with an ``assert`` statement
    so the check is still performed when Python runs with ``-O``.
    """
    if group is None:
        raise AssertionError(f"{description} is not initialized")
    return group


# Module-level slots for the parallel groups; each stays None until assigned.
_TP: Optional["GroupCoordinator"] = None


def get_tp_group() -> "GroupCoordinator":
    """Return the tensor model parallel group."""
    return _require_group(_TP, "tensor model parallel group")


# kept for backward compatibility
get_tensor_model_parallel_group = get_tp_group

_PP: Optional["GroupCoordinator"] = None


def get_pp_group() -> "GroupCoordinator":
    """Return the pipeline model parallel group."""
    return _require_group(_PP, "pipeline model parallel group")


_DP: Optional["GroupCoordinator"] = None


def get_dp_group() -> "GroupCoordinator":
    """Return the data parallel group."""
    return _require_group(_DP, "data parallel group")


_EP: Optional["GroupCoordinator"] = None


def get_ep_group() -> "GroupCoordinator":
    """Return the expert parallel group."""
    return _require_group(_EP, "expert parallel group")


# kept for backward compatibility
get_pipeline_model_parallel_group = get_pp_group
def init_distributed_environment(
    world_size: int = -1,
    rank: int = -1,
    distributed_init_method: str = "env://",
    local_rank: int = -1,
    backend: str = "nccl",
    data_parallel_size: int = 1,
    data_parallel_rank: int = 0,
):
    """Initialize torch.distributed (if needed) and the world group.

    Args:
        world_size: Number of processes; scaled by ``data_parallel_size``
            when that is greater than 1.
        rank: Rank of this process; offset by the data-parallel rank when
            ``data_parallel_size`` is greater than 1.
        distributed_init_method: ``init_method`` handed to
            ``torch.distributed.init_process_group``.
        local_rank: Device index for this process. ``-1`` means "derive it":
            from the ``LOCAL_RANK`` environment variable for ``env://``
            (falling back to ``rank``), otherwise from ``rank``.
        backend: torch.distributed backend name.
        data_parallel_size: Number of data-parallel replicas.
        data_parallel_rank: Index of this replica.

    Raises:
        AssertionError: If ``distributed_init_method`` is ``None`` while the
            process group still has to be created, or if the world group
            already exists with a different world size.
    """
    logger.debug(
        "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s",
        world_size,
        rank,
        local_rank,
        distributed_init_method,
        backend,
    )
    if data_parallel_size > 1:
        # Adjust the rank and world size for data parallel
        rank = data_parallel_rank * world_size + rank
        world_size = data_parallel_size * world_size
    if not torch.distributed.is_initialized():
        assert distributed_init_method is not None, (
            "distributed_init_method must be provided when initializing "
            "distributed environment"
        )
        # range(world_size) is empty while world_size is still the -1
        # placeholder, which would export an empty device list; only set the
        # variable when there is a real size to enumerate.
        if "HIP_VISIBLE_DEVICES" not in os.environ and world_size > 0:
            from .utils import update_environment_variables

            update_environment_variables(
                {"HIP_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))}
            )

        torch.distributed.init_process_group(
            backend=backend,
            init_method=distributed_init_method,
            world_size=world_size,
            rank=rank,
        )
    # set the local rank
    # local_rank is not available in torch ProcessGroup,
    # see https://github.com/pytorch/pytorch/issues/122816
    if local_rank == -1:
        # local rank not set, this usually happens in single-node
        # setting, where we can use rank as local rank
        if distributed_init_method == "env://":
            # Environment values are strings; convert so `local_rank` keeps
            # the `int` type the rest of the module annotates it with.
            local_rank = int(os.environ.get("LOCAL_RANK", rank))
        else:
            local_rank = rank
    global _WORLD
    if _WORLD is None:
        ranks = list(range(torch.distributed.get_world_size()))
        _WORLD = init_world_group(ranks, local_rank, backend)
    else:
        assert (
            _WORLD.world_size == torch.distributed.get_world_size()
        ), "world group already initialized with a different world size"
+ assert torch.distributed.is_initialized() + world_size: int = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + backend = backend or torch.distributed.get_backend(get_world_group().device_group) + + # data_parallel_size = 1 + # from vllm.config import get_current_vllm_config + + # config = get_current_vllm_config() + # if config is not None: + # data_parallel_size = config.parallel_config.data_parallel_size + + # the layout order is: ExternalDP x DP x PP x TP + # ExternalDP is the data parallel group that is not part of the model, + # every dp rank can generate independently (in verl integration). + # DP is the data parallel group that is part of the model, + # all the ranks in the same DP group should generate simultaneously, + # i.e. the `generate` call in the same DP group should be called together, + # otherwise it will cause deadlock. + # to get group_ranks for each dimension, transpose that dimension to the + # last dimension, then reshape to 2D, then unbind the last dimension + all_ranks = torch.arange(world_size).reshape( + -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size + ) # noqa + + # Build the tensor model-parallel groups. + global _TP + assert _TP is None, "tensor model parallel group is already initialized" + group_ranks = all_ranks.view(-1, tensor_model_parallel_size).unbind(0) + group_ranks = [x.tolist() for x in group_ranks] + + # message queue broadcaster is only used in tensor model parallel group + _TP = init_model_parallel_group( + group_ranks, + get_world_group().local_rank, + backend, + use_message_queue_broadcaster=True, + group_name="tp", + ) + + # # Build the DCP model-parallel groups. 
+ # global _DCP + # assert _DCP is None, "decode context model parallel group is already initialized" + # # Note(hc): In the current implementation of decode context parallel, + # # dcp_size must not exceed tp_size, because the world size does not + # # change by DCP, it simply reuses the GPUs of TP group, and split one + # # TP group into tp_size//dcp_size DCP groups. + # group_ranks = all_ranks.reshape(-1, decode_context_model_parallel_size).unbind(0) + # group_ranks = [x.tolist() for x in group_ranks] + # _DCP = init_model_parallel_group( + # group_ranks, + # get_world_group().local_rank, + # backend, + # use_message_queue_broadcaster=True, + # group_name="dcp", + # ) + + # Build the pipeline model-parallel groups. + global _PP + assert _PP is None, "pipeline model parallel group is already initialized" + group_ranks = ( + all_ranks.transpose(2, 3).reshape(-1, pipeline_model_parallel_size).unbind(0) + ) + group_ranks = [x.tolist() for x in group_ranks] + _PP = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="pp" + ) + + global _DP + assert _DP is None, "data parallel group is already initialized" + group_ranks = all_ranks.transpose(1, 3).reshape(-1, data_parallel_size).unbind(0) + group_ranks = [x.tolist() for x in group_ranks] + _DP = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="dp" + ) + + global _EP + assert _EP is None, "expert parallel group is already initialized" + group_ranks = ( + all_ranks.transpose(1, 2) + .reshape(-1, data_parallel_size * tensor_model_parallel_size) + .unbind(0) + ) + group_ranks = [x.tolist() for x in group_ranks] + _EP = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="ep" + ) + + logger.info( + "rank %s in world size %s is assigned as " + "DP rank %s, PP rank %s, TP rank %s, EP rank %s", + rank, + world_size, + _DP.rank_in_group, + _PP.rank_in_group, + _TP.rank_in_group, + _EP.rank_in_group, + ) + 
def ensure_model_parallel_initialized(
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
    backend: Optional[str] = None,
    data_parallel_size: int = 1,
) -> None:
    """Create the model-parallel groups, or validate the existing ones.

    When the groups do not exist yet they are built with the requested
    sizes. When they already exist, the tensor-parallel and
    pipeline-parallel world sizes must match the requested values.

    Raises:
        AssertionError: If existing groups have a different tensor-parallel
            or pipeline-parallel size than requested.
    """
    # The backend is resolved before the branch, so the world group has to
    # exist in either case.
    backend = backend or torch.distributed.get_backend(get_world_group().device_group)

    if model_parallel_is_initialized():
        assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
            "tensor parallel group already initialized, but of unexpected size: "
            f"{get_tensor_model_parallel_world_size()=} vs. "
            f"{tensor_model_parallel_size=}"
        )
        pp_world_size = get_pp_group().world_size
        assert pp_world_size == pipeline_model_parallel_size, (
            "pipeline parallel group already initialized, but of unexpected size: "
            f"{pp_world_size=} vs. "
            f"{pipeline_model_parallel_size=}"
        )
    else:
        initialize_model_parallel(
            tensor_model_parallel_size=tensor_model_parallel_size,
            pipeline_model_parallel_size=pipeline_model_parallel_size,
            backend=backend,
            data_parallel_size=data_parallel_size,
        )
def destroy_model_parallel():
    """Destroy every model-parallel group and reset its module-level slot.

    Covers the tensor, pipeline, data and expert parallel groups. Leaving
    `_DP` or `_EP` populated would make the next `initialize_model_parallel`
    call fail its "already initialized" assertions.
    """
    global _TP, _PP, _DP, _EP

    if _TP is not None:
        _TP.destroy()
    _TP = None

    if _PP is not None:
        _PP.destroy()
    _PP = None

    if _DP is not None:
        _DP.destroy()
    _DP = None

    if _EP is not None:
        _EP.destroy()
    _EP = None
def is_global_first_rank() -> bool:
    """Tell whether this process is the first rank of the whole job.

    Unlike group-specific checks such as
    `get_tensor_model_parallel_rank() == 0` or `get_pp_group().is_first_rank`,
    this looks at the global rank across all parallelism dimensions.

    Returns:
        bool: The world group's `is_first_rank` when the world group exists;
        otherwise True if torch.distributed is not initialized, else whether
        the torch global rank is 0. Any exception raised along the way is
        swallowed and reported as True.
    """
    try:
        # Prefer the world group when it has been created.
        world = _WORLD
        if world is not None:
            return world.is_first_rank

        # No process group means a single process; otherwise ask torch.
        return (
            not torch.distributed.is_initialized()
            or torch.distributed.get_rank() == 0
        )
    except Exception:
        # Best effort: on any failure behave like the first rank.
        return True
+ if isinstance(pg, ProcessGroup): + world_size = torch.distributed.get_world_size(group=pg) + else: + world_size = pg.world_size + + if world_size == 1: + return 1 + + # Build node assignment map + node_assignment = [0] * world_size # rank -> node_id + next_node_id = 0 + + for current_rank in range(world_size): + if node_assignment[current_rank] != 0: + continue # Already assigned to a node + + # Assign current rank to a new node + next_node_id += 1 + node_assignment[current_rank] = next_node_id + + # Find all ranks on the same node as current_rank + same_node_flags = in_the_same_node_as(pg, current_rank) + for other_rank, is_same_node in enumerate(same_node_flags): + if is_same_node and node_assignment[other_rank] == 0: + node_assignment[other_rank] = next_node_id + + return next_node_id diff --git a/aiter/dist/shm_broadcast.py b/aiter/dist/shm_broadcast.py new file mode 100644 index 0000000000000000000000000000000000000000..ab8d0ac6d5c1d078560a5e66e9d1d55cd1e7c2b4 --- /dev/null +++ b/aiter/dist/shm_broadcast.py @@ -0,0 +1,506 @@ +''' + + * Copyright (c) 2024, The vLLM team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
class ShmRingBuffer:
    """Fixed-size single-writer / multi-reader ring buffer in shared memory."""

    def __init__(self,
                 n_reader: int,
                 max_chunk_bytes: int,
                 max_chunks: int,
                 name: Optional[str] = None):
        """
        A shared memory ring buffer implementation for broadcast communication.
        Essentially, it is a queue where only one will `enqueue` and multiple
        will `dequeue`. The max size of each item, together with the max number
        of items that can be stored in the buffer are known in advance.
        In this case, we don't need to synchronize the access to
        the buffer.

        Buffer memory layout:
                  data                                 metadata
                    |                                      |
                    | (current_idx)                        | (current_idx)
                    v                                      v
        +-------------------------------+----------------------------------------+
        | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata |
        +-------------------------------+----------------------------------------+
        | max_chunks x max_chunk_bytes  | max_chunks x (1 + n_reader) bytes      |

        metadata memory layout: each byte is a flag, the first byte is the
        written flag, and the rest are reader flags. The flags are set to 0
        by default.
        +--------------+--------------+--------------+-----+--------------+
        | written_flag | reader0_flag | reader1_flag | ... | readerN_flag |
        +--------------+--------------+--------------+-----+--------------+

        The state of metadata is as follows:

        (case 1) 0???...???: the block is not written yet, cannot read,
            can write
        (case 2) 1000...000: the block is just written, can read,
            cannot write
        (case 3) 1???...???: the block is written and read by some readers,
            can read if not read, cannot write
        (case 4) 1111...111: the block is written and read by all readers,
            cannot read, can write

        State transition for readers:

        When a reader finds a block that it can read (case 2 or 3), it can
        yield the block for caller to read. Only after the caller finishes
        reading the block, the reader can mark the block as read.
        Readers only mark the block as read (from 0 to 1), the writer marks
        the block as ready to read (from 1 to 0).

        State transition for writer:

        When the writer writes to a block (case 1 or 4), it first resets the
        written flag to 0, converting either case to case 1. Then it can
        yield the block for caller to write. After the caller finishes
        writing the block, the writer can reset the reader flags to 0, and
        mark the block as written (from 0 to 1).
        NOTE: the order is important here, first reset the reader flags (so
        that we are still in case 1), then mark the block as written. The
        state transition is atomic. If we do it in the reverse order, it will
        go through case 3 and then back to case 2, and readers might read the
        intermediate case 3, which is not correct.

        During creation, `name` is None and the buffer is created. We can
        pass the created object to other processes by pickling it. The other
        processes will get the name of the shared memory and open it, so that
        they can access the same shared memory buffer.

        Args:
            n_reader: number of readers; one flag byte per reader is kept
                next to the written flag.
            max_chunk_bytes: size in bytes of every data chunk.
            max_chunks: number of chunks in the ring.
            name: name of an existing segment to attach to, or None to
                create a new one.
        """
        self.n_reader = n_reader
        self.metadata_size = 1 + n_reader
        self.max_chunk_bytes = max_chunk_bytes
        self.max_chunks = max_chunks
        self.total_bytes_of_buffer = (self.max_chunk_bytes +
                                      self.metadata_size) * self.max_chunks
        self.data_offset = 0
        self.metadata_offset = self.max_chunk_bytes * self.max_chunks

        if name is None:
            # we are creating a buffer
            self.is_creator = True
            self.shared_memory = shared_memory.SharedMemory(
                create=True, size=self.total_bytes_of_buffer)
            # initialize the metadata section to 0 (plain byte copy, no
            # tensor needed just to clear flag bytes)
            with memoryview(self.shared_memory.buf[self.metadata_offset:]
                            ) as metadata_buffer:
                metadata_buffer[:] = bytes(len(metadata_buffer))
        else:
            # we are opening an existing buffer
            self.is_creator = False
            # fix to https://stackoverflow.com/q/62748654/9191338
            # Python incorrectly tracks shared memory even if it is not
            # created by the process. The following patch is a workaround.
            with patch("multiprocessing.resource_tracker.register",
                       lambda *args, **kwargs: None):
                try:
                    self.shared_memory = shared_memory.SharedMemory(name=name)
                    # Some platforms allocate shared memory in whole pages,
                    # so the mapped block may be larger than requested; it
                    # must never be smaller.
                    assert (self.shared_memory.size
                            >= self.total_bytes_of_buffer)
                except FileNotFoundError:
                    # we might deserialize the object in a different node
                    # in this case, this object is not used,
                    # and we should suppress the error
                    pass

    def __reduce__(self):
        """Pickle as (class, ctor args + segment name) so peers re-attach."""
        return (
            self.__class__,
            (self.n_reader, self.max_chunk_bytes, self.max_chunks,
             self.shared_memory.name),
        )

    def __del__(self):
        """Detach from the segment; the creator also removes it."""
        if not hasattr(self, "shared_memory"):
            # attach failed (FileNotFoundError) or creation never finished
            return
        try:
            self.shared_memory.close()
        finally:
            # unlink even when close() fails, otherwise the creator would
            # leave the named segment behind
            if self.is_creator:
                self.shared_memory.unlink()

    @contextmanager
    def get_data(self, current_idx: int):
        """Yield a memoryview over data chunk `current_idx`."""
        start = self.data_offset + current_idx * self.max_chunk_bytes
        end = start + self.max_chunk_bytes
        with memoryview(self.shared_memory.buf[start:end]) as buf:
            yield buf

    @contextmanager
    def get_metadata(self, current_idx: int):
        """Yield a memoryview over the flag bytes of chunk `current_idx`."""
        start = self.metadata_offset + current_idx * self.metadata_size
        end = start + self.metadata_size
        with memoryview(self.shared_memory.buf[start:end]) as buf:
            yield buf
otherwise, we will only receive the first subscription + # see http://api.zeromq.org/3-3:zmq-setsockopt for more details + self.local_socket.setsockopt(XPUB_VERBOSE, True) + local_subscribe_port = get_open_port() + socket_addr = f"tcp://127.0.0.1:{local_subscribe_port}" + logger.debug("Binding to %s", socket_addr) + self.local_socket.bind(socket_addr) + + self.current_idx = 0 + + else: + self.buffer = None # type: ignore + local_subscribe_port = None + self.local_socket = None + self.current_idx = -1 + + if n_remote_reader > 0: + # for remote readers, we will: + # create a publish-subscribe socket to communicate large data + self.remote_socket = context.socket(XPUB) + self.remote_socket.setsockopt(XPUB_VERBOSE, True) + remote_subscribe_port = get_open_port() + if is_valid_ipv6_address(connect_ip): + self.remote_socket.setsockopt(IPV6, 1) + socket_addr = f"tcp://*:{remote_subscribe_port}" + self.remote_socket.bind(socket_addr) + + else: + remote_subscribe_port = None + self.remote_socket = None + + self._is_writer = True + self._is_local_reader = False + self.local_reader_rank = -1 + # rank does not matter for remote readers + self._is_remote_reader = False + + self.handle = Handle( + connect_ip=connect_ip, + local_reader_ranks=local_reader_ranks, + buffer=self.buffer, + local_subscribe_port=local_subscribe_port, + remote_subscribe_port=remote_subscribe_port, + ) + + logger.debug( + "vLLM message queue communication handle: %s", self.handle) + + def export_handle(self) -> Handle: + return self.handle + + @staticmethod + def create_from_handle(handle: Handle, rank) -> "MessageQueue": + self = MessageQueue.__new__(MessageQueue) + self.handle = handle + self._is_writer = False + + context = Context() + + if rank in handle.local_reader_ranks: + assert handle.buffer is not None + self.buffer = handle.buffer + self.current_idx = 0 + self.local_reader_rank = handle.local_reader_ranks.index(rank) + self._is_local_reader = True + self._is_remote_reader = False + + 
@contextmanager
def acquire_write(self):
    """
    Yield the next writable data chunk of the ring buffer (writer only).

    Spins, sleeping `RINGBUFFER_SLEEP_INTERVAL` between polls, until the
    chunk at `self.current_idx` is either unwritten or read by every reader.
    After the caller leaves the `with` body the reader flags are cleared,
    the written flag is set, and `self.current_idx` advances (wrapping at
    `max_chunks`).
    """
    assert self._is_writer, "Only writers can acquire write"
    wait_began = time.monotonic()
    warn_round = 1
    ring = self.buffer
    while True:
        with ring.get_metadata(self.current_idx) as flags:
            # written but not yet consumed by all readers -> must wait
            still_unread = flags[0] and sum(flags[1:]) != ring.n_reader
            if not still_unread:
                # block is either (1) not written or (2) read by all readers

                # mark the block as not written while the caller fills it
                flags[0] = 0
                with ring.get_data(self.current_idx) as chunk:
                    yield chunk

                # NOTE: order is important here
                # first clear every read flag, then raise the written flag;
                # otherwise readers may think they already read the block
                for slot in range(1, ring.n_reader + 1):
                    flags[slot] = 0
                flags[0] = 1
                self.current_idx = (self.current_idx + 1) % ring.max_chunks
                break

            # for writers, `self.current_idx` is the next block to write;
            # back off briefly before polling again
            time.sleep(RINGBUFFER_SLEEP_INTERVAL)

            # warn once per elapsed warning interval
            waited = time.monotonic() - wait_began
            if waited > VLLM_RINGBUFFER_WARNING_INTERVAL * warn_round:
                logger.warning(
                    "No available block found in %s second. ",
                    VLLM_RINGBUFFER_WARNING_INTERVAL)
                warn_round += 1
def dequeue(self):
    """
    Receive and unpickle the next object published by the writer.

    Local readers take the payload from the shared-memory chunk unless its
    first byte is 1 (overflow), in which case the payload is read from the
    local socket instead. Remote readers always read from the remote socket.

    Returns:
        The unpickled object.

    Raises:
        RuntimeError: if this queue is neither a local nor a remote reader.
    """
    # SECURITY NOTE(review): every path below feeds bytes received from
    # shared memory or a socket to `pickle.loads`, which can execute
    # arbitrary code. This is only safe while the writer and the transport
    # are fully trusted - confirm before exposing the remote socket beyond a
    # trusted network.
    if self._is_local_reader:
        obj = None
        with self.acquire_read() as buf:
            overflow = buf[0] == 1
            if not overflow:
                # no need to know the size of serialized object
                # pickle format contains the size information internally
                # see https://docs.python.org/3/library/pickle.html
                obj = pickle.loads(buf[1:])
        if overflow:
            # payload did not fit in a chunk; it was sent over the socket
            obj = pickle.loads(self.local_socket.recv())
        return obj

    if self._is_remote_reader:
        return pickle.loads(self.remote_socket.recv())

    raise RuntimeError("Only readers can dequeue")
@staticmethod
def create_from_process_group(pg: ProcessGroup,
                              max_chunk_bytes,
                              max_chunks,
                              writer_rank=0) -> "MessageQueue":
    """
    Build a connected MessageQueue on every rank of `pg` (collective call).

    The rank equal to `writer_rank` constructs the queue and broadcasts its
    handle; all other ranks build a reader from the received handle. Every
    rank then calls `wait_until_ready()` before the queue is returned.
    """
    my_rank = dist.get_rank(pg)
    world = dist.get_world_size(pg)
    global_ranks = dist.get_process_group_ranks(pg)

    from .parallel_state import in_the_same_node_as

    # group ranks that share a node with the writer (writer included)
    on_writer_node = [
        idx
        for idx, same in enumerate(
            in_the_same_node_as(pg, source_rank=writer_rank))
        if same
    ]
    n_reader = world - 1
    n_local_reader = len(on_writer_node) - 1
    local_reader_ranks = [idx for idx in on_writer_node if idx != writer_rank]

    queue: MessageQueue
    if group_is_writer := (my_rank == writer_rank):
        queue = MessageQueue(
            n_reader=n_reader,
            n_local_reader=n_local_reader,
            local_reader_ranks=local_reader_ranks,
            max_chunk_bytes=max_chunk_bytes,
            max_chunks=max_chunks,
        )
        dist.broadcast_object_list([queue.export_handle()],
                                   src=global_ranks[writer_rank],
                                   group=pg)
    else:
        inbox = [None]
        dist.broadcast_object_list(inbox,
                                   src=global_ranks[writer_rank],
                                   group=pg)
        queue = MessageQueue.create_from_handle(inbox[0], my_rank)  # type: ignore

    queue.wait_until_ready()
    return queue
+* See the License for the specific language governing permissions and +* limitations under the License. +""" + +import argparse +import asyncio +import contextlib +import datetime +import enum +import gc +import inspect +import ipaddress +import os +import random +import socket +import subprocess +import sys +import tempfile +import threading +import uuid +import warnings +import weakref +from asyncio import FIRST_COMPLETED, ensure_future +from functools import lru_cache, partial, wraps +from platform import uname +from typing import ( + Any, + AsyncGenerator, + Awaitable, + Callable, + Dict, + Generic, + Hashable, + List, + Literal, + Optional, + OrderedDict, + Set, + Tuple, + Type, + TypeVar, + Union, + overload, +) +from uuid import uuid4 + +import numpy as np +import numpy.typing as npt +import psutil +import torch +import torch.types +import yaml +from packaging.version import Version +from typing_extensions import ParamSpec, TypeIs, assert_never + +from aiter import logger + + +# Exception strings for non-implemented encoder/decoder scenarios + +STR_NOT_IMPL_ENC_DEC_SWA = ( + "Sliding window attention for encoder/decoder models " + + "is not currently supported." +) + +STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = ( + "Prefix caching for encoder/decoder models " + "is not currently supported." +) + +STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL = ( + "Chunked prefill for encoder/decoder models " + "is not currently supported." +) + +STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = ( + "Models with logits_soft_cap " + "require FlashInfer backend, which is " + "currently not supported for encoder/decoder " + "models." +) + +STR_NOT_IMPL_ENC_DEC_LORA = ( + "LoRA is currently not currently " "supported with encoder/decoder " "models." +) + +STR_NOT_IMPL_ENC_DEC_PP = ( + "Pipeline parallelism is not " "currently supported with " "encoder/decoder models." +) + +STR_NOT_IMPL_ENC_DEC_MM = ( + "Multimodal is not currently " "supported with encoder/decoder " "models." 
+) + +STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ( + "Speculative decoding is not " "currently supported with encoder/" "decoder models." +) + +STR_NOT_IMPL_ENC_DEC_BACKEND = ( + "XFormers is the only backend " + "currently supported with encoder/" + "decoder models." +) + +STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ( + "Prompt adapters are not " "currently supported with encoder/" "decoder models." +) + +STR_NOT_IMPL_ENC_DEC_CPU = ( + "CPU is not currently supported with " "encoder/decoder models." +) + +# Efficiently import all enc/dec error strings +# rather than having to import all of the above +STR_NOT_IMPL_ENC_DEC_ERR_STRS = { + "STR_NOT_IMPL_ENC_DEC_SWA": STR_NOT_IMPL_ENC_DEC_SWA, + "STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE": STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + "STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL": STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL, + "STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP": STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP, + "STR_NOT_IMPL_ENC_DEC_LORA": STR_NOT_IMPL_ENC_DEC_LORA, + "STR_NOT_IMPL_ENC_DEC_PP": STR_NOT_IMPL_ENC_DEC_PP, + "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM, + "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, + "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, + "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER, + "STR_NOT_IMPL_ENC_DEC_CPU": STR_NOT_IMPL_ENC_DEC_CPU, +} + +# Constants related to forcing the attention backend selection + +# String name of register which may be set in order to +# force auto-selection of attention backend by Attention +# wrapper +STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND" + +# Possible string values of STR_BACKEND_ENV_VAR +# register, corresponding to possible backends +STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER" +STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA" +STR_ROCM_FLASH_ATTN_VAL: str = "ROCM_FLASH" +STR_XFORMERS_ATTN_VAL: str = "XFORMERS" +STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" +STR_INVALID_VAL: str = "INVALID" + +GB_bytes = 1_000_000_000 +"""The number of bytes in one 
class rpd_trace:
    """
    Decorator / context manager that wraps work in an rpd tracer range.

    With `skip=True` the object is inert: no tracer is created, decorating
    returns the function unchanged and entering/exiting does nothing.
    """

    def __init__(self, filename=None, name=None, nvtx=False, args=None, skip=False):
        self.skip = skip
        if self.skip:
            return
        self.name = name
        self.args = args if args else ""
        self.rpd = self.initialize_rpd_tracer(filename, nvtx)

    def _recreate_cm(self):
        """Return the context manager used by the decorator form (self)."""
        return self

    def __call__(self, func):
        """Decorate `func` so each call runs inside this tracer range."""
        if self.skip:
            return func

        # range name: configured prefix + function name, or the qualname
        if self.name:
            self.name += f"{func.__name__}"
        else:
            self.name = f"{func.__qualname__}"

        @wraps(func)
        def inner(*args, **kwds):
            with self._recreate_cm():
                return func(*args, **kwds)

        return inner

    def __enter__(self):
        if not self.skip:
            self.rpd.__enter__()
            self.rpd.rangePush("python", f"{self.name}", f"{self.args}")
        return self

    def __exit__(self, *exc):
        if not self.skip:
            self.rpd.rangePop()
            self.rpd.__exit__(None, None, None)
        # never swallow an exception raised inside the range
        return False

    @staticmethod
    def setup_environment_variables(filename):
        """Export the RPDT_* variables for the given trace file."""
        os.environ["RPDT_AUTOSTART"] = "0"
        os.environ["RPDT_FILENAME"] = filename

    def initialize_rpd_tracer(self, filename, nvtx):
        """Create the tracer control object; errors are printed and re-raised."""
        try:
            from rpdTracerControl import rpdTracerControl

            rpd_trace.setup_environment_variables(filename)
            rpdTracerControl.setFilename(name=filename, append=True)
            return rpdTracerControl(nvtx=nvtx)
        except Exception as e:
            print(f"Error initializing rpdTracerControl: {e}")
            raise

    @staticmethod
    def create_file(filename):
        """Write an empty rpd schema into `filename`; errors are only printed."""
        import sqlite3

        from rocpd.schema import RocpdSchema

        try:
            print("Creating empty rpd schema file ...")
            filename = str(filename)
            with sqlite3.connect(filename) as connection:
                RocpdSchema().writeSchema(connection)
                connection.commit()
        except sqlite3.OperationalError as e:
            print(f"SQLite operational error: {e}")
        except Exception as e:
            print(f"An error occurred while creating the filename: {e}")
class PyObjectCache:
    """Pool of pre-built Python objects, reused to avoid repeated
    allocations across scheduler iterations.
    """

    def __init__(self, obj_builder):
        self._obj_builder = obj_builder
        self._index = 0

        # start with 128 ready-made objects
        self._obj_cache = []
        self._obj_cache.extend(self._obj_builder() for _ in range(128))

    def _grow_cache(self):
        """Double the pool by building as many objects as it already holds."""
        current_size = len(self._obj_cache)
        self._obj_cache.extend(
            self._obj_builder() for _ in range(current_size))

    def get_object(self):
        """Hand out the next pooled object, doubling the pool when it is
        exhausted.
        """
        position = self._index
        if position >= len(self._obj_cache):
            self._grow_cache()
            assert self._index < len(self._obj_cache)

        self._index = position + 1
        return self._obj_cache[position]

    def reset(self):
        """Make every pooled object available again for the next iteration."""
        self._index = 0
def seed_everything(seed: int) -> None:
    """
    Set the seed of each random module.

    Seeds Python's `random`, NumPy, all CUDA/ROCm devices when a GPU runtime
    is available, and all XPU devices when `is_xpu()` reports one.

    Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
    """
    random.seed(seed)
    np.random.seed(seed)

    # `current_platform` is not imported in this module; ask torch directly
    # whether a CUDA/ROCm runtime is usable instead.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    if is_xpu():
        torch.xpu.manual_seed_all(seed)


def random_uuid() -> str:
    """Return a random UUID4 as a 32-character hex string."""
    return uuid.uuid4().hex


@lru_cache(maxsize=None)
def get_vllm_instance_id() -> str:
    """
    If the environment variable VLLM_INSTANCE_ID is set, return it.
    Otherwise, return a random UUID.
    Instance id represents an instance of the VLLM. All processes in the same
    instance should have the same instance id.

    The result is cached, so the first answer is kept for the lifetime of
    the process.
    """
    # `envs` is not imported in this module; read the variable directly.
    return os.environ.get("VLLM_INSTANCE_ID") or f"vllm-instance-{random_uuid()}"
+ """ + + def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: + loop = asyncio.get_event_loop() + p_func = partial(func, *args, **kwargs) + return loop.run_in_executor(executor=None, func=p_func) + + return _async_wrapper + + +async def iterate_with_cancellation( + iterator: AsyncGenerator[T, None], + is_cancelled: Callable[[], Awaitable[bool]], +) -> AsyncGenerator[T, None]: + """Convert async iterator into one that polls the provided function + at least once per second to check for client cancellation. + """ + + # Can use anext() in python >= 3.10 + awaits = [ensure_future(iterator.__anext__())] + while True: + done, pending = await asyncio.wait(awaits, timeout=1) + if await is_cancelled(): + with contextlib.suppress(BaseException): + awaits[0].cancel() + await iterator.aclose() + raise asyncio.CancelledError("client cancelled") + if done: + try: + item = await awaits[0] + awaits[0] = ensure_future(iterator.__anext__()) + yield item + except StopAsyncIteration: + # we are done + return + + +async def merge_async_iterators( + *iterators: AsyncGenerator[T, None], + is_cancelled: Optional[Callable[[], Awaitable[bool]]] = None, +) -> AsyncGenerator[Tuple[int, T], None]: + """Merge multiple asynchronous iterators into a single iterator. + + This method handle the case where some iterators finish before others. + When it yields, it yields a tuple (i, item) where i is the index of the + iterator that yields the item. + + It also optionally polls a provided function at least once per second + to check for client cancellation. 
+ """ + + # Can use anext() in python >= 3.10 + awaits = {ensure_future(pair[1].__anext__()): pair for pair in enumerate(iterators)} + timeout = None if is_cancelled is None else 1 + try: + while awaits: + done, pending = await asyncio.wait( + awaits.keys(), return_when=FIRST_COMPLETED, timeout=timeout + ) + if is_cancelled is not None and await is_cancelled(): + raise asyncio.CancelledError("client cancelled") + for d in done: + pair = awaits.pop(d) + try: + item = await d + i, it = pair + awaits[ensure_future(it.__anext__())] = pair + yield i, item + except StopAsyncIteration: + pass + finally: + # Cancel any remaining iterators + for f, (_, it) in awaits.items(): + with contextlib.suppress(BaseException): + f.cancel() + await it.aclose() + + +async def collect_from_async_generator(iterator: AsyncGenerator[T, None]) -> List[T]: + """Collect all items from an async generator into a list.""" + items = [] + async for item in iterator: + items.append(item) + return items + + +def get_ip() -> str: + # host_ip = envs.VLLM_HOST_IP + # if host_ip: + # return host_ip + + # IP is not set, try to get it from the network interface + + # try ipv4 + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable + return s.getsockname()[0] + except Exception: + pass + + # try ipv6 + try: + s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + # Google's public DNS server, see + # https://developers.google.com/speed/public-dns/docs/using#addresses + s.connect(("2001:4860:4860::8888", 80)) # Doesn't need to be reachable + return s.getsockname()[0] + except Exception: + pass + + warnings.warn( + "Failed to get the IP address, using 0.0.0.0 by default." 
+ "The value can be set by the environment variable" + " VLLM_HOST_IP or HOST_IP.", + stacklevel=2, + ) + return "0.0.0.0" + + +def is_valid_ipv6_address(address: str) -> bool: + try: + ipaddress.IPv6Address(address) + return True + except ValueError: + return False + + +def get_distributed_init_method(ip: str, port: int) -> str: + # Brackets are not permitted in ipv4 addresses, + # see https://github.com/python/cpython/issues/103848 + return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" + + +def get_open_zmq_ipc_path() -> str: + base_rpc_path = envs.VLLM_RPC_BASE_PATH + return f"ipc://{base_rpc_path}/{uuid4()}" + + +def get_open_port() -> int: + # port = envs.VLLM_PORT + port = None + if port is not None: + while True: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port)) + return port + except OSError: + port += 1 # Increment port number if already in use + logger.info("Port %d is already in use, trying port %d", port - 1, port) + # try ipv4 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + except OSError: + # try ipv6 + with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +def find_process_using_port(port: int) -> Optional[psutil.Process]: + for conn in psutil.net_connections(): + if conn.laddr.port == port: + try: + return psutil.Process(conn.pid) + except psutil.NoSuchProcess: + return None + return None + + +def update_environment_variables(envs: Dict[str, str]): + for k, v in envs.items(): + if k in os.environ and os.environ[k] != v: + logger.warning( + "Overwriting environment variable %s " "from '%s' to '%s'", + k, + os.environ[k], + v, + ) + os.environ[k] = v + + +def chunk_list(lst: List[T], chunk_size: int): + """Yield successive chunk_size chunks from lst.""" + for i in range(0, len(lst), chunk_size): + yield lst[i : i + chunk_size] + + +def cdiv(a: int, b: int) -> int: + 
"""Ceiling division.""" + return -(a // -b) + + +def _generate_random_fp8( + tensor: torch.Tensor, + low: float, + high: float, +) -> None: + # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type, + # it may occur Inf or NaN if we directly use torch.randint + # to generate random data for fp8 data. + # For example, s.11111.00 in fp8e5m2 format represents Inf. + # | E4M3 | E5M2 + # -----|-------------|------------------- + # Inf | N/A | s.11111.00 + # NaN | s.1111.111 | s.11111.{01,10,11} + from vllm import _custom_ops as ops + + tensor_tmp = torch.empty_like(tensor, dtype=torch.float16) + tensor_tmp.uniform_(low, high) + ops.convert_fp8(tensor, tensor_tmp) + del tensor_tmp + + +def get_kv_cache_torch_dtype( + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None, +) -> torch.dtype: + if isinstance(cache_dtype, str): + if cache_dtype == "auto": + if isinstance(model_dtype, str): + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] + elif isinstance(model_dtype, torch.dtype): + torch_dtype = model_dtype + else: + raise ValueError(f"Invalid model dtype: {model_dtype}") + elif cache_dtype in ["half", "bfloat16", "float"]: + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] + elif cache_dtype == "fp8": + torch_dtype = torch.uint8 + else: + raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") + elif isinstance(cache_dtype, torch.dtype): + torch_dtype = cache_dtype + else: + raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") + return torch_dtype + + +def create_kv_caches_with_random_flash( + num_blocks: int, + block_size: int, + num_layers: int, + num_heads: int, + head_size: int, + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None, + seed: int = 0, + device: Optional[str] = "cuda", +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + seed_everything(seed) + + torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) + 
key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) + scale = head_size**-0.5 + + key_caches: List[torch.Tensor] = [] + value_caches: List[torch.Tensor] = [] + + for _ in range(num_layers): + key_value_cache = torch.empty( + size=key_value_cache_shape, dtype=torch_dtype, device=device + ) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + key_value_cache.uniform_(-scale, scale) + elif cache_dtype == "fp8": + _generate_random_fp8(key_value_cache, -scale, scale) + else: + raise ValueError(f"Does not support key cache of type {cache_dtype}") + key_caches.append(key_value_cache[:, 0]) + value_caches.append(key_value_cache[:, 1]) + return key_caches, value_caches + + +def create_kv_caches_with_random( + num_blocks: int, + block_size: int, + num_layers: int, + num_heads: int, + head_size: int, + cache_dtype: Optional[Union[str, torch.dtype]], + model_dtype: Optional[Union[str, torch.dtype]] = None, + seed: int = 0, + device: Optional[str] = "cuda", +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + if cache_dtype == "fp8" and head_size % 16: + raise ValueError( + f"Does not support key cache of type fp8 with head_size {head_size}" + ) + + seed_everything(seed) + + torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) + + scale = head_size**-0.5 + x = 16 // torch.tensor([], dtype=torch_dtype).element_size() + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + key_caches: List[torch.Tensor] = [] + for _ in range(num_layers): + key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + key_cache.uniform_(-scale, scale) + elif cache_dtype == "fp8": + _generate_random_fp8(key_cache, -scale, scale) + else: + raise ValueError(f"Does not support key cache of type {cache_dtype}") + key_caches.append(key_cache) + + value_cache_shape = (num_blocks, num_heads, head_size, block_size) + value_caches: List[torch.Tensor] = [] + for _ 
in range(num_layers): + value_cache = torch.empty( + size=value_cache_shape, dtype=torch_dtype, device=device + ) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + value_cache.uniform_(-scale, scale) + elif cache_dtype == "fp8": + _generate_random_fp8(value_cache, -scale, scale) + else: + raise ValueError(f"Does not support value cache of type {cache_dtype}") + value_caches.append(value_cache) + return key_caches, value_caches + + +@lru_cache +def print_warning_once(msg: str) -> None: + # Set the stacklevel to 2 to print the caller's line info + logger.warning(msg, stacklevel=2) + + +@lru_cache(maxsize=None) +def is_pin_memory_available() -> bool: + + if in_wsl(): + # Pinning memory in WSL is not supported. + print_warning_once( + "Using 'pin_memory=False' as WSL is detected. " + "This may slow down the performance." + ) + return False + elif is_xpu(): + print_warning_once("Pin memory is not supported on XPU.") + return False + elif is_neuron(): + print_warning_once("Pin memory is not supported on Neuron.") + return False + elif is_cpu() or is_openvino(): + return False + return True + + +class DeviceMemoryProfiler: + + def __init__(self, device: Optional[torch.types.Device] = None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. 
+ if current_platform.is_cuda_alike(): + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + elif is_xpu(): + torch.xpu.reset_peak_memory_stats(self.device) # type: ignore + mem = torch.xpu.max_memory_allocated(self.device) # type: ignore + return mem + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() + + +def make_ndarray_with_pad( + x: List[List[T]], + pad: T, + dtype: npt.DTypeLike, + *, + max_len: Optional[int] = None, +) -> npt.NDArray: + """ + Make a padded array from 2D inputs. + + The padding is applied to the end of each inner list until it reaches + `max_len`. + """ + if max_len is None: + # Unlike for most functions, map is faster than a genexpr over `len` + max_len = max(map(len, x), default=0) + + padded_x = np.full((len(x), max_len), pad, dtype=dtype) + for ind, blocktb in enumerate(x): + assert len(blocktb) <= max_len + padded_x[ind, : len(blocktb)] = blocktb + + return padded_x + + +def make_tensor_with_pad( + x: List[List[T]], + pad: T, + dtype: torch.dtype, + *, + max_len: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + pin_memory: bool = False, +) -> torch.Tensor: + """ + Make a padded tensor from 2D inputs. + + The padding is applied to the end of each inner list until it reaches + `max_len`. 
+ """ + np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] + padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len) + + tensor = torch.from_numpy(padded_x).to(device) + if pin_memory: + tensor = tensor.pin_memory() + + return tensor + + +def async_tensor_h2d( + data: list, + dtype: torch.dtype, + target_device: Union[str, torch.device], + pin_memory: bool, +) -> torch.Tensor: + """Asynchronously create a tensor and copy it from host to device.""" + t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu") + return t.to(device=target_device, non_blocking=True) + + +def get_dtype_size(dtype: torch.dtype) -> int: + """Get the size of the data type in bytes.""" + return torch.tensor([], dtype=dtype).element_size() + + +# `collections` helpers +def is_list_of( + value: object, + typ: Type[T], + *, + check: Literal["first", "all"] = "first", +) -> TypeIs[List[T]]: + if not isinstance(value, list): + return False + + if check == "first": + return len(value) == 0 or isinstance(value[0], typ) + elif check == "all": + return all(isinstance(v, typ) for v in value) + + assert_never(check) + + +JSONTree = Union[ + Dict[str, "JSONTree[T]"], List["JSONTree[T]"], Tuple["JSONTree[T]", ...], T +] +"""A nested JSON structure where the leaves need not be JSON-serializable.""" + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: Dict[str, JSONTree[T]], +) -> Dict[str, JSONTree[U]]: ... + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: List[JSONTree[T]], +) -> List[JSONTree[U]]: ... + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: Tuple[JSONTree[T], ...], +) -> Tuple[JSONTree[U], ...]: ... + + +@overload +def json_map_leaves( + func: Callable[[T], U], + value: JSONTree[T], +) -> JSONTree[U]: ... 
+ + +def json_map_leaves(func: Callable[[T], U], value: JSONTree[T]) -> JSONTree[U]: + if isinstance(value, dict): + return {k: json_map_leaves(func, v) for k, v in value.items()} + elif isinstance(value, list): + return [json_map_leaves(func, v) for v in value] + elif isinstance(value, tuple): + return tuple(json_map_leaves(func, v) for v in value) + else: + return func(value) + + +def flatten_2d_lists(lists: List[List[T]]) -> List[T]: + """Flatten a list of lists to a single list.""" + return [item for sublist in lists for item in sublist] + + +def init_cached_hf_modules() -> None: + """ + Lazy initialization of the Hugging Face modules. + """ + from transformers.dynamic_module_utils import init_hf_modules + + init_hf_modules() + + +@lru_cache(maxsize=None) +def find_library(lib_name: str) -> str: + """ + Find the library file in the system. + `lib_name` is full filename, with both prefix and suffix. + This function resolves `lib_name` to the full path of the library. + """ + # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard + # `/sbin/ldconfig` should exist in all Linux systems. 
+ # `/sbin/ldconfig` searches the library in the system + libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode() + # each line looks like the following: + # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1 + locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line] + # `LD_LIBRARY_PATH` searches the library in the user-defined paths + env_ld_library_path = envs.LD_LIBRARY_PATH + if not locs and env_ld_library_path: + locs = [ + os.path.join(dir, lib_name) + for dir in env_ld_library_path.split(":") + if os.path.exists(os.path.join(dir, lib_name)) + ] + if not locs: + raise ValueError(f"Cannot find {lib_name} in the system.") + return locs[0] + + +def find_nccl_library() -> str: + """ + We either use the library file specified by the `VLLM_NCCL_SO_PATH` + environment variable, or we find the library file brought by PyTorch. + After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be + found by `ctypes` automatically. + """ + so_file = envs.VLLM_NCCL_SO_PATH + + # manually load the nccl library + if so_file: + logger.info( + "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file + ) + else: + if torch.version.cuda is not None: + so_file = "libnccl.so.2" + elif torch.version.hip is not None: + so_file = "librccl.so.1" + else: + raise ValueError("NCCL only supports CUDA and ROCm backends.") + logger.info("Found nccl from library %s", so_file) + return so_file + + +def enable_trace_function_call_for_thread() -> None: + """Set up function tracing for the current thread, + if enabled via the VLLM_TRACE_FUNCTION environment variable + """ + + if envs.VLLM_TRACE_FUNCTION: + tmp_dir = tempfile.gettempdir() + filename = ( + f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" + f"_thread_{threading.get_ident()}_" + f"at_{datetime.datetime.now()}.log" + ).replace(" ", "_") + log_path = os.path.join(tmp_dir, "vllm", get_vllm_instance_id(), filename) + os.makedirs(os.path.dirname(log_path), exist_ok=True) + 
enable_trace_function_call(log_path) + + +# `functools` helpers +def identity(value: T) -> T: + return value + + +F = TypeVar("F", bound=Callable[..., Any]) + + +def deprecate_kwargs( + *kws: str, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None, +) -> Callable[[F], F]: + deprecated_kws = set(kws) + + if not callable(is_deprecated): + is_deprecated = partial(identity, is_deprecated) + + def wrapper(fn: F) -> F: + + @wraps(fn) + def inner(*args, **kwargs): + if is_deprecated(): + deprecated_kwargs = kwargs.keys() & deprecated_kws + if deprecated_kwargs: + msg = ( + f"The keyword arguments {deprecated_kwargs} are " + "deprecated and will be removed in a future update." + ) + if additional_message is not None: + msg += f" {additional_message}" + + warnings.warn( + DeprecationWarning(msg), + stacklevel=3, # The inner function takes up one level + ) + + return fn(*args, **kwargs) + + return inner # type: ignore + + return wrapper + + +@lru_cache(maxsize=2) +def get_cuda_visible_devices(return_str=False) -> List[Any]: + """Get the value of the CUDA_VISIBLE_DEVICES environment variable.""" + cuda_visible_devices = os.environ.get("HIP_VISIBLE_DEVICES", None) + if cuda_visible_devices: + device_ids_str = cuda_visible_devices.split(",") + device_ids = list(map(int, device_ids_str)) + else: + device_ids = list(range(cuda_device_count_stateless())) + device_ids_str = list(map(str, device_ids)) + update_environment_variables( + {"HIP_VISIBLE_DEVICES": (",".join(device_ids_str))} + ) + logger.info( + "HIP_VISIBLE_DEVICES is not set, " + f"using all available devices: {device_ids}" + ) + return device_ids_str if return_str else device_ids + + +@lru_cache(maxsize=8) +def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int: + # Note: cuda_visible_devices is not used, but we keep it as an argument for + # LRU Cache purposes. 
+ + # Code below is based on + # https://github.com/pytorch/pytorch/blob/ + # c1cd946818442aca8c7f812b16d187ce1586c3bc/ + # torch/cuda/__init__.py#L831C1-L831C17 + import torch.cuda + import torch.version + + if not torch.cuda._is_compiled(): + return 0 + if is_hip(): + # ROCm uses hysmi instead of nvml for stateless device count + # This requires a sufficiently modern version of Torch 2.4.0 + raw_count = ( + torch.cuda._device_count_amdsmi() + if (hasattr(torch.cuda, "_device_count_amdsmi")) + else -1 + ) + else: + raw_count = torch.cuda._device_count_nvml() + r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count + return r + + +def cuda_device_count_stateless() -> int: + """Get number of CUDA devices, caching based on the value of + CUDA_VISIBLE_DEVICES at the time of call. + + This should be used instead of torch.cuda.device_count() + unless CUDA_VISIBLE_DEVICES has already been set to the desired + value.""" + + # This can be removed and simply replaced with torch.cuda.get_device_count + # after https://github.com/pytorch/pytorch/pull/122815 is released. 
+ return _cuda_device_count_stateless() + + +def cuda_is_initialized() -> bool: + """Check if CUDA is initialized.""" + if not torch.cuda._is_compiled(): + return False + return torch.cuda.is_initialized() + + +def weak_bind( + bound_method: Callable[..., Any], +) -> Callable[..., None]: + """Make an instance method that weakly references + its associated instance and no-ops once that + instance is collected.""" + ref = weakref.ref(bound_method.__self__) # type: ignore[attr-defined] + unbound = bound_method.__func__ # type: ignore[attr-defined] + + def weak_bound(*args, **kwargs) -> None: + if inst := ref(): + unbound(inst, *args, **kwargs) + + return weak_bound + + +def run_once(f: Callable[P, None]) -> Callable[P, None]: + + def run_once_wrapper(*args: P.args, **kwargs: P.kwargs) -> None: + if not run_once_wrapper.ran_once_flag: # type: ignore[attr-defined] + run_once_wrapper.ran_once_flag = True # type: ignore[attr-defined] + return f(*args, **kwargs) + + run_once_wrapper.ran_once_flag = False # type: ignore[attr-defined] + return run_once_wrapper + + +class FlexibleArgumentParser(argparse.ArgumentParser): + """ArgumentParser that allows both underscore and dash in names.""" + + def parse_args(self, args=None, namespace=None): + if args is None: + args = sys.argv[1:] + + if "--config" in args: + args = FlexibleArgumentParser._pull_args_from_config(args) + + # Convert underscores to dashes and vice versa in argument names + processed_args = [] + for arg in args: + if arg.startswith("--"): + if "=" in arg: + key, value = arg.split("=", 1) + key = "--" + key[len("--") :].replace("_", "-") + processed_args.append(f"{key}={value}") + else: + processed_args.append("--" + arg[len("--") :].replace("_", "-")) + else: + processed_args.append(arg) + + return super().parse_args(processed_args, namespace) + + @staticmethod + def _pull_args_from_config(args: List[str]) -> List[str]: + """Method to pull arguments specified in the config file + into the command-line args 
variable. + + The arguments in config file will be inserted between + the argument list. + + example: + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + ```python + $: vllm {serve,chat,complete} "facebook/opt-12B" \ + --config config.yaml -tp 2 + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--config', 'config.yaml', + '-tp', '2' + ] + $: args = [ + "serve,chat,complete", + "facebook/opt-12B", + '--port', '12323', + '--tensor-parallel-size', '4', + '-tp', '2' + ] + ``` + + Please note how the config args are inserted after the sub command. + this way the order of priorities is maintained when these are args + parsed by super(). + """ + assert args.count("--config") <= 1, "More than one config file specified!" + + index = args.index("--config") + if index == len(args) - 1: + raise ValueError( + "No config file specified! \ + Please check your command-line arguments." + ) + + file_path = args[index + 1] + + config_args = FlexibleArgumentParser._load_config_file(file_path) + + # 0th index is for {serve,chat,complete} + # followed by model_tag (only for serve) + # followed by config args + # followed by rest of cli args. + # maintaining this order will enforce the precedence + # of cli > config > defaults + if args[0] == "serve": + if index == 1: + raise ValueError( + "No model_tag specified! Please check your command-line" + " arguments." 
+ ) + args = ( + [args[0]] + [args[1]] + config_args + args[2:index] + args[index + 2 :] + ) + else: + args = [args[0]] + config_args + args[1:index] + args[index + 2 :] + + return args + + @staticmethod + def _load_config_file(file_path: str) -> List[str]: + """Loads a yaml file and returns the key value pairs as a + flattened list with argparse like pattern + ```yaml + port: 12323 + tensor-parallel-size: 4 + ``` + returns: + processed_args: list[str] = [ + '--port': '12323', + '--tensor-parallel-size': '4' + ] + + """ + + extension: str = file_path.split(".")[-1] + if extension not in ("yaml", "yml"): + raise ValueError( + "Config file must be of a yaml/yml type.\ + %s supplied", + extension, + ) + + # only expecting a flat dictionary of atomic types + processed_args: List[str] = [] + + config: Dict[str, Union[int, str]] = {} + try: + with open(file_path, "r") as config_file: + config = yaml.safe_load(config_file) + except Exception as ex: + logger.error( + "Unable to read the config file at %s. \ + Make sure path is correct", + file_path, + ) + raise ex + + for key, value in config.items(): + processed_args.append("--" + key) + processed_args.append(str(value)) + + return processed_args + + +async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, **kwargs): + """Utility function to run async task in a lock""" + async with lock: + return await task(*args, **kwargs) + + +def supports_kw( + callable: Callable[..., object], + kw_name: str, + requires_kw_only: bool = False, + allow_var_kwargs: bool = True, +) -> bool: + """Check if a keyword is a valid kwarg for a callable; if requires_kw_only + disallows kwargs names that can also be positional arguments. 
+ """ + params = inspect.signature(callable).parameters + if not params: + return False + + param_val = params.get(kw_name) + + # Types where the it may be valid, i.e., explicitly defined & nonvariadic + passable_kw_types = set( + ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + ) + + if param_val: + is_sig_param = param_val.kind in passable_kw_types + # We want kwargs only, but this is passable as a positional arg + if ( + requires_kw_only + and is_sig_param + and param_val.kind != inspect.Parameter.KEYWORD_ONLY + ): + return False + if (requires_kw_only and param_val.kind == inspect.Parameter.KEYWORD_ONLY) or ( + not requires_kw_only and is_sig_param + ): + return True + + # If we're okay with var-kwargs, it's supported as long as + # the kw_name isn't something like *args, **kwargs + if allow_var_kwargs: + # Get the last param; type is ignored here because params is a proxy + # mapping, but it wraps an ordered dict, and they appear in order. + # Ref: https://docs.python.org/3/library/inspect.html#inspect.Signature.parameters + last_param = params[next(reversed(params))] # type: ignore + return ( + last_param.kind == inspect.Parameter.VAR_KEYWORD + and last_param.name != kw_name + ) + return False + + +def resolve_mm_processor_kwargs( + init_kwargs: Optional[Dict[str, Any]], + inference_kwargs: Optional[Dict[str, Any]], + callable: Callable[..., object], + allow_var_kwargs: bool = False, +) -> Dict[str, Any]: + """Applies filtering to eliminate invalid mm_processor_kwargs, i.e., + those who are not explicit keywords to the given callable (of one is + given; otherwise no filtering is done), then merges the kwarg dicts, + giving priority to inference_kwargs if there are any collisions. + + In the case that no kwarg overrides are provided, returns an empty + dict so that it can still be kwarg expanded into the callable later on. 
+ + If allow_var_kwargs=True, allows for things that can be expanded into + kwargs as long as they aren't naming collision for var_kwargs or potential + positional arguments. + """ + # Filter inference time multimodal processor kwargs provided + runtime_mm_kwargs = get_allowed_kwarg_only_overrides( + callable, overrides=inference_kwargs, allow_var_kwargs=allow_var_kwargs + ) + + # Filter init time multimodal processor kwargs provided + init_mm_kwargs = get_allowed_kwarg_only_overrides( + callable, overrides=init_kwargs, allow_var_kwargs=allow_var_kwargs + ) + + # Merge the final processor kwargs, prioritizing inference + # time values over the initialization time values. + mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs} + return mm_processor_kwargs + + +def get_allowed_kwarg_only_overrides( + callable: Callable[..., object], + overrides: Optional[Dict[str, Any]], + allow_var_kwargs: bool = False, +) -> Dict[str, Any]: + """ + Given a callable which has one or more keyword only params and a dict + mapping param names to values, drop values that can be not be kwarg + expanded to overwrite one or more keyword-only args. This is used in a + few places to handle custom processor overrides for multimodal models, + e.g., for profiling when processor options provided by the user + may affect the number of mm tokens per instance. + + Args: + callable: Callable which takes 0 or more keyword only arguments. + If None is provided, all overrides names are allowed. + overrides: Potential overrides to be used when invoking the callable. + allow_var_kwargs: Allows overrides that are expandable for var kwargs. + + Returns: + Dictionary containing the kwargs to be leveraged which may be used + to overwrite one or more keyword only arguments when invoking the + callable. 
+ """ + if not overrides: + return {} + + # Drop any mm_processor_kwargs provided by the user that + # are not kwargs, unless it can fit it var_kwargs param + filtered_overrides = { + kwarg_name: val + for kwarg_name, val in overrides.items() + if supports_kw( + callable, + kwarg_name, + requires_kw_only=True, + allow_var_kwargs=allow_var_kwargs, + ) + } + + # If anything is dropped, log a warning + dropped_keys = overrides.keys() - filtered_overrides.keys() + if dropped_keys: + logger.warning( + "The following intended overrides are not keyword-only args " + "and and will be dropped: %s", + dropped_keys, + ) + + return filtered_overrides + + +# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0. +# In particular, the FakeScalarType is not supported for earlier versions of +# PyTorch which breaks dynamo for any ops registered using ScalarType. +def supports_dynamo() -> bool: + base_torch_version = Version(Version(torch.__version__).base_version) + return base_torch_version >= Version("2.4.0") + + +# Some backends use pytorch version < 2.4.0 which doesn't +# support `torch.library.custom_op`. 
+def supports_custom_op() -> bool: + return hasattr(torch.library, "custom_op") + + +class AtomicCounter: + """An atomic, thread-safe counter""" + + def __init__(self, initial=0): + """Initialize a new atomic counter to given initial value""" + self._value = initial + self._lock = threading.Lock() + + def inc(self, num=1): + """Atomically increment the counter by num and return the new value""" + with self._lock: + self._value += num + return self._value + + def dec(self, num=1): + """Atomically decrement the counter by num and return the new value""" + with self._lock: + self._value -= num + return self._value + + @property + def value(self): + return self._value diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..85604fd8b688a1655dcf3944fb5d5d3fea5272ba --- /dev/null +++ b/aiter/fused_moe.py @@ -0,0 +1,1006 @@ +# SPDX-License-Identifier: MIT + +import torch +import os +from typing import Optional +import functools +import aiter +from aiter import logger +from aiter import ActivationType, QuantType, dtypes + +# from aiter import get_hip_quant as get_quant +# from aiter import get_torch_quant as get_quant +from aiter import get_triton_quant as get_quant +from aiter.jit.core import AITER_ROOT_DIR, PY, get_asm_dir, bd_dir, mp_lock +from aiter.jit.utils.chip_info import get_cu_num +from aiter import pertoken_quant, ck_moe + +BLOCK_SIZE_M = 32 + + +def moe_sorting( + topk_ids, + topk_weights, + num_experts, + model_dim, + moebuf_dtype, + block_size=BLOCK_SIZE_M, + expert_mask=None, +): + device = topk_ids.device + M, topk = topk_ids.shape + max_num_tokens_padded = topk_ids.numel() + num_experts * block_size - topk + max_num_m_blocks = int((max_num_tokens_padded + block_size - 1) // block_size) + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=dtypes.i32, device=device) + sorted_weights = torch.empty( + (max_num_tokens_padded,), dtype=dtypes.fp32, device=device + ) + sorted_expert_ids = 
torch.empty( + (max_num_m_blocks,), dtype=dtypes.i32, device=device + ) + tokens_positions_per_expert = torch.empty( + (num_experts*2,), dtype=dtypes.i32, device=device + ) + num_valid_ids = torch.empty((1), dtype=dtypes.i32, device=device) + moe_buf = torch.empty((M, model_dim), dtype=moebuf_dtype, device=device) + + # for now, moe_sorting_fwd only support int32 topk_ids + if topk_ids.dtype != dtypes.i32: + topk_ids = topk_ids.to(dtypes.i32) + + aiter.moe_sorting_fwd( + topk_ids, + topk_weights, + sorted_ids, + sorted_weights, + sorted_expert_ids, + tokens_positions_per_expert, + num_valid_ids, + moe_buf, + num_experts, + block_size, + expert_mask, + ) + return sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf + + +@functools.lru_cache(maxsize=1024) +def get_inter_dim(w1_shape, w2_shape): + E, _, model_dim = w1_shape + E, model_dim, inter_dim = w2_shape + + int4_war = model_dim // w1_shape[-1] + inter_dim *= int4_war + return E, model_dim, inter_dim + + +def fused_moe( + hidden_states, + w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K + w2, # [expert(local_expert:EP), dim, inter_dim] + topk_weight, + topk_ids, + expert_mask=None, # EP + activation=ActivationType.Silu, + quant_type=QuantType.No, + doweight_stage1=False, + # following for quant + w1_scale=None, # [expert(local_expert:EP), inter_dim, 1] + w2_scale=None, # [expert(local_expert:EP), model_dim, 1] + a1_scale=None, # [expert(local_expert:EP), 1, model_dim] + a2_scale=None, # [expert(local_expert:EP), 1, inter_dim] + # following for tuning + block_size_M=None, +): + """user API""" + M, topk = topk_ids.shape + E, model_dim, inter_dim = get_inter_dim(w1.shape, w2.shape) + + assert w1.shape[1] in [ + inter_dim, + inter_dim * 2, + ], f"Invalid MoE weight: {w1.shape=} {w2.shape=}" + isG1U1 = inter_dim != w1.shape[1] + + global_E = E + if expert_mask is not None: + global_E = expert_mask.numel() + dtype = hidden_states.dtype + q_dtype_w = w1.dtype + 
q_dtype_a = w1.dtype if w1.dtype != torch.uint32 else dtypes.fp8 + + #暂时关闭block_size_M 与 2stage 相关的pass + + # if block_size_M is None: + # _, _, block_size_M, *_ = get_2stage_cfgs( + # M, + # model_dim, + # inter_dim, + # E, + # topk, + # dtype, + # q_dtype_a, + # q_dtype_w, + # quant_type, + # isG1U1, + # activation, + # doweight_stage1, + # ) + # run_1stage = M < 256 + run_1stage = True + #暂不支持blockwise quant + # run_1stage = quant_type == QuantType.per_128x128 + block_size_M = 32 if run_1stage else block_size_M + return aiter.ck_moe( + hidden_states = hidden_states, + w1 = w1, + w2 = w2, + topk_weight = topk_weight, + topk_ids = topk_ids, + w1_scale = w1_scale, + w2_scale = w2_scale, + fc1_smooth_scale = a1_scale, + fc2_smooth_scale = a2_scale, + ).to(dtypes.fp16) + + # sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, moe_buf = moe_sorting( + # topk_ids, topk_weight, global_E, model_dim, dtype, block_size_M, expert_mask + # ) + + # if run_1stage: + # assert ( + # doweight_stage1 == False + # ), "doweight_stage1 not support in fused_moe_1stage" + # return fused_moe_1stage( + # hidden_states, + # w1, + # w2, + # topk, + # sorted_ids, + # sorted_weights, + # sorted_expert_ids, + # num_valid_ids, + # moe_buf, + # isG1U1, + # block_size_M, + # activation=activation, + # quant_type=quant_type, + # q_dtype_a=q_dtype_a, + # q_dtype_w=q_dtype_w, + # w1_scale=w1_scale, + # w2_scale=w2_scale, + # a1_scale=a1_scale, + # a2_scale=a2_scale, + # ) + # else: + # return fused_moe_2stages( + # hidden_states, + # w1, + # w2, + # topk, + # sorted_ids, + # sorted_weights, + # sorted_expert_ids, + # num_valid_ids, + # moe_buf, + # isG1U1, + # block_size_M, + # activation=activation, + # quant_type=quant_type, + # doweight_stage1=doweight_stage1, + # q_dtype_a=q_dtype_a, + # q_dtype_w=q_dtype_w, + # w1_scale=w1_scale, + # w2_scale=w2_scale, + # a1_scale=a1_scale, + # a2_scale=a2_scale, + # ) + + +def fused_moe_1stage( + hidden_states, + w1, # [expert(local_expert:EP), 
inter_dim*2, dim] N,K + w2, # [expert(local_expert:EP), dim, inter_dim] + topk, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + moe_buf, + isG1U1, + block_size_M=32, + activation=ActivationType.Silu, + quant_type=QuantType.No, + # following for quant + q_dtype_a=None, + q_dtype_w=None, + w1_scale=None, # [expert(local_expert:EP), inter_dim, 1] + w2_scale=None, # [expert(local_expert:EP), model_dim, 1] + a1_scale=None, # [expert(local_expert:EP), 1, model_dim] + a2_scale=None, # [expert(local_expert:EP), 1, inter_dim] +): + if quant_type == QuantType.No and ActivationType.Silu and not isG1U1: + # pure bf16 + aiter.ck_moe( + hidden_states, + w1, + w2, + topk_weight, + + ) + aiter.fmoe( + moe_buf, + hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + ) + + else: + assert False, "quant pass not support!" + # quant_type = ( + # QuantType.per_1x128 if quant_type == QuantType.per_128x128 else quant_type + # ) + # quant_func = get_quant(quant_type) + # a1, a1_scale = quant_func(hidden_states, scale=a1_scale, quant_dtype=q_dtype_a) + # if quant_type == QuantType.per_1x128: + # a1 = a1.view_as(hidden_states) + # a1_scale = a1_scale.view(hidden_states.shape[0], -1).t().contiguous() + # fmoe_func = functools.partial( + # aiter.fmoe_fp8_blockscale_g1u1, + # fc_scale_blkn=128, + # fc_scale_blkk=128, + # ) + # elif isG1U1: + # fmoe_func = aiter.fmoe_g1u1 + # else: + # fmoe_func = aiter.fmoe_int8_g1u0 + + # fmoe_func( + # moe_buf, + # a1, + # w1, + # w2, + # sorted_ids, + # sorted_weights, + # sorted_expert_ids, + # num_valid_ids, + # topk, + # a1_scale, + # w1_scale, + # w2_scale, + # fc2_smooth_scale=None, + # activation=activation, + # ) + return moe_buf + + +@functools.lru_cache(maxsize=1024) +def get_block_size_M(token, topk, expert, inter_dim): + cu_num = get_cu_num() + tileN = 128 + tgN = (inter_dim + tileN - 1) // tileN + support_list = [32, 64, 128] + + tmp = [] + for el in support_list: + 
max_num_tokens = token * topk + expert * el - topk + tg_num = tgN * (max_num_tokens + el - 1) // el + rnd = (tg_num + cu_num - 1) // cu_num + empty = cu_num - tg_num % cu_num + tmp.append((rnd, empty, el)) + return sorted(tmp, key=lambda x: x[:2])[0][-1] + + +cfg_2stages = None + + +@functools.lru_cache(maxsize=1024) +def get_2stage_cfgs( + token, + model_dim, + inter_dim, + expert, + topk, + dtype, + q_dtype_a, + q_dtype_w, + q_type, + use_g1u1, + activation, + doweight_stage1, +): + def get_cfg_2stages(tune_file): + import pandas as pd + + cfg_2stages = pd.read_csv(tune_file) + cfg_2stages = cfg_2stages.set_index( + [ + "token", + "model_dim", + "inter_dim", + "expert", + "topk", + "act_type", + "dtype", + "q_dtype_a", + "q_dtype_w", + "q_type", + "use_g1u1", + "doweight_stage1", + ] + ).to_dict("index") + return cfg_2stages + + global cfg_2stages + config_path = f"{AITER_ROOT_DIR}/aiter/configs/" + tune_file = os.path.join(config_path, "tuned_fmoe.csv") + untune_file = os.path.join(config_path, "untuned_fmoe.csv") + profile_file = os.path.join(config_path, "profile_fmoe.csv") + if cfg_2stages is None: + cfg_2stages = get_cfg_2stages(tune_file) + keys = ( + token, + model_dim, + inter_dim, + expert, + topk, + str(activation), + str(dtype), + str(q_dtype_a), + str(q_dtype_w), + str(q_type), + use_g1u1, + doweight_stage1, + ) + + def MainFunc(): + with open(untune_file, "a") as f: + q_dtype_ws = q_dtype_w if q_dtype_w != torch.uint32 else "torch.int4" + f.write( + f"\n{token},{model_dim},{inter_dim},{expert},{topk},{activation},{dtype},{q_dtype_a},{q_dtype_ws},{q_type},{int(use_g1u1)},{int(doweight_stage1)}" + ) + logger.info("\033[34m Start tuning fmoe") + os.system( + f"{PY} {get_asm_dir()}/fmoe_2stages/tune.py -i {untune_file} -o {tune_file} -o2 {profile_file} --last" + ) + + def FinalFunc(): + logger.info("\033[0m") + + cfg = cfg_2stages.get(keys, None) + if cfg is None and os.environ.get("AITER_ONLINE_TUNE", "0") == "1": + lock_path = os.path.join(bd_dir, 
f"lock_fmoe_tune_{keys}") + mp_lock(lock_path, MainFunc=MainFunc, FinalFunc=FinalFunc) + cfg_2stages = get_cfg_2stages(tune_file) + cfg = cfg_2stages.get(keys, None) + if cfg is None: + logger.warning(f"Fmoe tuning not support for {keys}") + + if cfg is None: + block_m = get_block_size_M(token, topk, expert, inter_dim) + ksplit = 0 + tag = "" + else: + block_m = cfg["block_m"] + ksplit = cfg["ksplit"] + tag = cfg["tag"] + + # war + if q_dtype_w in [dtypes.bf16, dtypes.fp16, torch.uint32]: + tag = "ck" + + logger.info(f"[fused_moe] using {'default' if cfg is None else tag} for {keys} ") + + if "ck" in tag: + return ( + functools.partial( + ck_stage1, + activation=activation, + ), + aiter.ck_moe_stage2, + block_m, + ksplit, + ) + + # TODO: remove when stage2 support more size + tmpList = [32, 64, 128] + if block_m not in tmpList: + tag = "" + block_m = ([el for el in tmpList if block_m < el] + [128])[0] + + return ( + functools.partial( + asm_stage1, + kernelName=tag, + activation=activation, + quant_type=q_type, + ), + aiter.ck_moe_stage2, + block_m, + ksplit, + ) + + +@functools.lru_cache() +def get1tensor(device): + return torch.tensor(1.0, dtype=torch.float, device=device) + + +# def fused_moe_2stages( +# hidden_states, +# w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K +# w2, # [expert(local_expert:EP), dim, inter_dim] +# topk, +# sorted_ids, +# sorted_weights, +# sorted_expert_ids, +# num_valid_ids, +# moe_out, +# isG1U1, +# block_size_M, +# activation=ActivationType.Silu, +# quant_type=QuantType.No, +# doweight_stage1=False, +# # following for quant +# q_dtype_a=None, +# q_dtype_w=None, +# w1_scale=None, # [expert(local_expert:EP), inter_dim, 1] +# w2_scale=None, # [expert(local_expert:EP), model_dim, 1] +# a1_scale=None, # [expert(local_expert:EP), 1, model_dim] +# a2_scale=None, # [expert(local_expert:EP), 1, inter_dim] +# ): + +# quant_func = get_quant(quant_type) + +# token_num, _ = hidden_states.shape +# E, model_dim, inter_dim = 
get_inter_dim(w1.shape, w2.shape) +# dtype = hidden_states.dtype +# device = hidden_states.device + +# stage1, stage2, block_m, ksplit = get_2stage_cfgs( +# token_num, +# model_dim, +# inter_dim, +# E, +# topk, +# dtype, +# q_dtype_a, +# q_dtype_w, +# quant_type, +# isG1U1, +# activation, +# doweight_stage1, +# ) + +# a1, a1_scale = quant_func(hidden_states, scale=a1_scale, quant_dtype=q_dtype_a) +# if quant_type != QuantType.per_128x128: +# a2 = torch.empty( +# (token_num, topk, inter_dim), +# dtype=dtype, +# device=device, +# ) +# else: +# ratio = a1_scale.element_size() // a1.element_size() +# a2 = torch.empty( +# (token_num + (token_num * ratio + 127) // 128, topk, inter_dim), +# dtype=q_dtype_a, +# device=device, +# ) + +# a2 = stage1( +# a1, +# w1, +# w2, +# sorted_ids, +# sorted_expert_ids, +# num_valid_ids, +# a2, +# block_m=block_m, +# a1_scale=a1_scale, +# w1_scale=w1_scale, +# sorted_weights=sorted_weights if doweight_stage1 else None, +# ) + +# if quant_type != QuantType.per_128x128: +# if quant_type == QuantType.per_Token: +# a2 = a2.view(token_num, -1) +# a2, a2_scale = quant_func(a2, scale=a2_scale, quant_dtype=q_dtype_a) +# a2 = a2.view(token_num, topk, inter_dim) +# else: +# a2_v = a2[:token_num, :, :] +# a2_scale = ( +# a2[token_num:, ...] 
+# .view(-1)[: token_num * topk * inter_dim * ratio // 128] +# .view(dtypes.fp32) +# .view(token_num, -1) +# ) +# a2 = a2_v + +# if quant_type == aiter.QuantType.No: +# a2_scale = get1tensor(device) + +# stage2( +# a2, +# w1, +# w2, +# sorted_ids, +# sorted_expert_ids, +# num_valid_ids, +# moe_out, +# topk, +# w2_scale, +# a2_scale, +# block_size_M, +# sorted_weights=sorted_weights if not doweight_stage1 else None, +# ) + +# return moe_out + + +def torch_moe_act(act_input, torch_act, inter_dim): + if act_input.shape[-1] == inter_dim: + return torch_act(act_input) + else: + gate, up = act_input.split([inter_dim, inter_dim], dim=-1) + return torch_act(gate) * up + + +# def asm_stage1( +# input, +# w1, +# w2, +# sorted_ids, +# sorted_expert_ids, +# num_valid_ids, +# out, # [token_num, topk, inter_dim] +# block_m: int, +# kernelName: str = "", +# ksplit: int = 0, +# activation=ActivationType.Silu, +# quant_type=QuantType.No, +# a1_scale=None, +# w1_scale=None, +# sorted_weights=None, +# ): +# dtype = dtypes.bf16 # out.dtype, asm only support bf16 +# if quant_type != QuantType.per_128x128: +# out = out.view(dtype) +# device = out.device +# token_num, topk, _ = out.shape +# E, model_dim, inter_dim = get_inter_dim(w1.shape, w2.shape) + +# if quant_type == QuantType.per_Tensor: +# a1_scale = a1_scale.view(1, 1).repeat(token_num, 1) +# w1_scale = w1_scale.view(E, 1).repeat(1, w1.shape[1]) +# quant_type = QuantType.per_Token + +# tmp_out = out +# if ksplit > 0: +# tmp_out = torch.zeros( +# (token_num, topk, w1.shape[1]), +# dtype=dtypes.fp32, +# device=device, +# ).view(dtype) + +# aiter.moe_stage1_g1u1( +# input, +# w1, +# w2, +# sorted_ids, +# sorted_expert_ids, +# num_valid_ids, +# tmp_out, +# inter_dim, +# kernelName, +# block_m, +# ksplit=ksplit, +# activation=activation, +# quant_type=quant_type, +# a1_scale=a1_scale, +# w1_scale=w1_scale, +# sorted_weights=sorted_weights, +# ) +# if ksplit > 0: +# if activation == ActivationType.Silu: +# aiter.silu_and_mul(out, 
tmp_out.view(dtypes.fp32).to(dtype)) +# else: +# aiter.gelu_and_mul(out, tmp_out.view(dtypes.fp32).to(dtype)) +# return out + + +# def ck_stage1( +# input, # [token, model_dim] +# w1, # [E, inter_dim*2, model_dim] +# w2, # [E, model_dim, inter_dim] +# sorted_ids, # [max_num_tokens_padded] +# sorted_expert_ids, # [max_num_m_blocks] +# num_valid_ids, # [1] +# out, # [token_num, topk, inter_dim] +# block_m=32, +# activation=ActivationType.Silu, +# a1_scale=None, +# w1_scale=None, +# sorted_weights=None, +# ): +# _, topk, _ = out.shape +# # max_num_tokens_padded = sorted_expert_ids.shape[0]*block_size + +# if activation == ActivationType.Silu: +# act_op = 1 +# else: +# act_op = 0 + +# aiter.ck_moe_stage1( +# input, +# w1, +# w2, +# sorted_ids, +# sorted_expert_ids, +# num_valid_ids, +# out, +# topk, +# w1_scale, +# a1_scale, +# block_m, +# sorted_weights, +# act_op, +# ) + +# return out + + +def torch_moe( + hidden_states, + w1, + w2, + topk_weight, + topk_ids, + # following for int8 quant + fc1_scale=None, # [expert(local_expert:EP), inter_dim, 1] + fc2_scale=None, # [expert(local_expert:EP), model_dim, 1] + fc1_smooth_scale=None, # [expert(local_expert:EP), 1, model_dim] + fc2_smooth_scale=None, # [expert(local_expert:EP), 1, inter_dim] + expert_mask=None, + activation=ActivationType.Silu, +): + computeType = dtypes.fp32 + dtype = hidden_states.dtype + torch_act = aiter.get_torch_act(activation) + hidden_states = hidden_states.to(computeType) + w1 = w1.to(computeType) + w2 = w2.to(computeType) + B, D = hidden_states.shape + topk = topk_weight.shape[1] + if expert_mask is not None: + local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32) - 1 + local_expert_hash[expert_mask == 0] = -1 + topk_ids = local_expert_hash[topk_ids] + + hidden_states = hidden_states.view(B, -1, D).repeat(1, topk, 1) + out = torch.zeros( + (B, topk, D), + dtype=computeType, + device=hidden_states.device, + ) + + inter_dim = w2.shape[2] + + if fc1_scale is not None: + # gose to quant 
D_w8a8/w8a8 + expert = w1.shape[0] + w2D = w2.shape[-1] + w1 = (w1.view(-1, D) * fc1_scale.view(-1, 1)).view(expert, -1, D) + w2 = (w2.view(-1, w2D) * fc2_scale.view(-1, 1)).view(expert, -1, w2D) + + if fc1_smooth_scale is not None: + expert = fc1_smooth_scale.shape[0] + fc1_smooth_scale = fc1_smooth_scale.view(expert, -1) + fc2_smooth_scale = fc2_smooth_scale.view(expert, -1) + + for E_id in range(w1.shape[0]): + mask = topk_ids == E_id + if mask.sum(): + sub_tokens = hidden_states[mask] + if fc1_smooth_scale is not None: + sub_tokens = sub_tokens * (fc1_smooth_scale[E_id]) + + act_input = sub_tokens @ (w1[E_id].transpose(0, 1)) + act_out = torch_moe_act(act_input, torch_act, inter_dim) + if fc2_smooth_scale is not None: + act_out = act_out * (fc2_smooth_scale[E_id]) + out[mask] = act_out @ (w2[E_id].transpose(0, 1)) + + return (out * topk_weight.view(B, -1, 1)).sum(dim=1).to(dtype) + +# For test +def torch_moe_blockscale( + hidden_states, + w1, # [expert, inter_dim*2, model_dim] + w2, # [expert, model_dim, inter_dim] + topk_weight, + topk_ids, + dtype, + # following for quant + scale_blks=(128, 128), + a_scale=None, + # [expert, inter_dim/blk_m, model_dim/blk_k] + fc1_scale=None, + # [expert, model_dim/blk_m, inter_dim/blk_k] + fc2_scale=None, + expert_mask=None, +): + computeType = dtypes.fp32 + hidden_states = hidden_states.to(computeType) + w1 = w1.to(computeType) + w2 = w2.to(computeType) + token_num, topk = topk_ids.shape + expert, model_dim, inter_dim = w2.shape + B, D = hidden_states.shape + topk = topk_weight.shape[1] + if expert_mask is not None: + local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32) - 1 + local_expert_hash[expert_mask == 0] = -1 + topk_ids = local_expert_hash[topk_ids] + + blk_n, blk_k = scale_blks + if a_scale is not None: + # print(f'{a_scale.unsqueeze(-1).shape=}, {hidden_states.view(token_num, -1, blk_k).shape=}') + hidden_states = hidden_states.view(token_num, -1, blk_k) * a_scale.unsqueeze(-1) + hidden_states = 
hidden_states.view(token_num, -1) + + hidden_states = hidden_states.view(token_num, 1, model_dim).repeat(1, topk, 1) + out = torch.zeros( + (B, topk, D), + dtype=computeType, + device=hidden_states.device, + ) + if w2.shape[2] * 2 == w1.shape[1]: + moeType = "g1u1" + else: + moeType = "g1u0" + + nblk_n = inter_dim // blk_n + nblk_k = model_dim // blk_k + if fc1_scale is not None: + # gose to quant D_w8a8/w8a8 + # blk_n, blk_k = scale_blks + # expert, nblk_n, nblk_k = fc1_scale.shape + fc1_scale = rearrange( + fc1_scale.view(-1, 1) + .repeat(1, blk_n * blk_k) + .view(expert, -1, nblk_k, blk_n, blk_k), + "e num_blk_n num_blk_k blk_n blk_k -> e (num_blk_n blk_n) (num_blk_k blk_k)", + ) + fc2_scale = rearrange( + fc2_scale.view(-1, 1) + .repeat(1, blk_n * blk_k) + .view(expert, nblk_k, nblk_n, blk_k, blk_n), + "e num_blk_n num_blk_k blk_n blk_k -> e (num_blk_n blk_n) (num_blk_k blk_k)", + ) + w1 = w1 * fc1_scale + w2 = w2 * fc2_scale + + for E_id in range(w1.shape[0]): + mask = topk_ids == E_id + if mask.sum(): + sub_tokens = hidden_states[mask] + act_input = sub_tokens @ (w1[E_id].transpose(0, 1)) + if moeType == "g1u1": + gate, up = act_input.split([inter_dim, inter_dim], dim=-1) + act_out = F.silu(gate) * up + else: + act_out = F.gelu(act_input) + out[mask] = act_out @ (w2[E_id].transpose(0, 1)) + + return (out * topk_weight.view(B, -1, 1)).sum(dim=1).to(dtype) + +def torch_moe_stage1( + hidden_states, + w1, # E, inter_dim*2, model_dim + w2, # E, model_dim, inter_dim + topk_weight, + topk_ids, + dtype=dtypes.fp16, + activation=ActivationType.Silu, + quant_type=QuantType.No, + # following for quant + a1_scale=None, # [token, 1] + w1_scale=None, # [expert, inter_dim, 1] + doweight=False, + group_by_expert=False, +): + ctype = dtypes.fp32 # compute type + hidden_states = hidden_states.to(ctype) + w1 = w1.to(ctype) + + B, D = hidden_states.shape + topk = topk_weight.shape[1] + N = w1.shape[1] + E, model_dim, inter_dim = get_inter_dim(w1.shape, w2.shape) + + if 
quant_type in [QuantType.per_Token, QuantType.per_Tensor]: + w1 = w1 * w1_scale.view(w1_scale.shape[0], -1, 1) + hidden_states = hidden_states * a1_scale + # per_128x128 + elif quant_type == QuantType.per_128x128: + w1_shape = w1.shape + w1 = w1.view( + w1.shape[0], w1.shape[1] // 128, 128, w1.shape[2] // 128, 128 + ) * w1_scale.view( + w1_scale.shape[0], w1.shape[1] // 128, 1, w1.shape[2] // 128, 1 + ) + w1 = w1.view(w1_shape) + + a1_scale = a1_scale.view(hidden_states.shape[0], -1, 1) + a1_scale = a1_scale.repeat( + 1, 1, hidden_states.shape[-1] // a1_scale.shape[1] + ).view(hidden_states.shape[0], -1) + hidden_states = hidden_states * a1_scale + elif quant_type == QuantType.No: + pass + else: + assert False, f"Unsupported quant_type: {quant_type}" + + hidden_states = hidden_states.view(B, -1, D).repeat(1, topk, 1) + + out = torch.zeros( + (B, topk, N), + dtype=ctype, + device=hidden_states.device, + ) + for E_id in range(w1.shape[0]): + mask = topk_ids == E_id + if mask.sum(): + sub_tokens = hidden_states[mask] + act_input = sub_tokens @ (w1[E_id].transpose(0, 1)) + if doweight: + act_input = act_input * topk_weight[mask].view(-1, 1) + out[mask] = act_input + + + if group_by_expert: + out_flat = out.reshape(-1, out.shape[-1]) + expert_indices = [] + for expert_id in range(E): + positions = torch.nonzero(topk_ids == expert_id, as_tuple=False) + if positions.numel() == 0: + continue + linear_idx = positions[:, 0] * topk + positions[:, 1] + expert_indices.append(linear_idx) + + if expert_indices: + gather_idx = torch.cat(expert_indices).to( + device=out_flat.device, dtype=torch.long + ) + out_grouped = out_flat.index_select(0, gather_idx) + else: + out_grouped = out_flat[:0] + + out = out_grouped + + use_g1u1 = w1.shape[1] == (2 * inter_dim) + torch_act = aiter.get_torch_act(activation) + if use_g1u1: + gate, up = out.split([inter_dim, inter_dim], dim=-1) + out = torch_act(gate) * up + else: + out = torch_act(out) + return out.to(dtype) + + +def torch_moe_stage2( + 
hidden_states, + w1, # E, inter_dim*2, model_dim + w2, # E, model_dim, inter_dim + topk_weights, + topk_ids, + dtype=dtypes.fp16, + quant_type=QuantType.No, + w2_scale=None, # [1] + a2_scale=None, # [expert]]' + doweight=True, +): + ctype = dtypes.fp32 # compute type + hidden_states = hidden_states.to(ctype) + w2 = w2.to(ctype) + + token_num, topk = topk_ids.shape + num_experts, model_dim, inter_dim = w2.shape + hidden_states = hidden_states.view(token_num, topk, inter_dim) + + if quant_type in [QuantType.per_Token, QuantType.per_Tensor]: + w2 = w2 * w2_scale.view(w2_scale.shape[0], -1, 1) + # per_128x128 + elif quant_type == QuantType.per_128x128: + w2_shape = w2.shape + w2 = w2.view( + w2.shape[0], w2.shape[1] // 128, 128, w2.shape[2] // 128, 128 + ) * w2_scale.view( + w2_scale.shape[0], w2.shape[1] // 128, 1, w2.shape[2] // 128, 1 + ) + w2 = w2.view(w2_shape) + + if quant_type in [QuantType.per_Token, QuantType.per_Tensor]: + hidden_states = hidden_states * a2_scale.view(a2_scale.shape[0], -1, 1) + elif quant_type == QuantType.per_128x128: + a2_scale = a2_scale.view(hidden_states.shape[0], topk, -1, 1) + a2_scale = a2_scale.repeat(1, 1, 1, 128).view(hidden_states.shape[0], topk, -1) + hidden_states = hidden_states * a2_scale + + out = torch.zeros( + (token_num, topk, model_dim), + dtype=ctype, + device=hidden_states.device, + ) + for E_id in range(w1.shape[0]): + mask = topk_ids == E_id + if mask.sum(): + sub_tokens = hidden_states[mask] + act_input = sub_tokens @ (w2[E_id].transpose(0, 1)) + out[mask] = act_input + if doweight: + out = out * topk_weights.view(token_num, -1, 1) + return out.sum(1).to(dtype) + + +def fused_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + topk_ids: Optional[torch.Tensor] = None, + topk_weights: Optional[torch.Tensor] = None, + is_softmax: bool = True, +): + assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + + M, _ = hidden_states.shape + + 
if topk_weights is None: + topk_weights = torch.empty( + M, topk, dtype=dtypes.fp32, device=hidden_states.device + ) + if topk_ids is None: + topk_ids = torch.empty(M, topk, dtype=dtypes.i32, device=hidden_states.device) + + if is_softmax: + token_expert_indicies = torch.empty( + M, topk, dtype=dtypes.i32, device=hidden_states.device + ) + + aiter.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + renormalize, + ) + del token_expert_indicies # Not used. Will be used in the future. + + # sigmoid + else: + # Use sigmoid and simple topk + # scores = gating_output.sigmoid() + # topk_weights_temp, topk_ids_temp = torch.topk(scores, k=topk, dim=-1) + # topk_weights.copy_(topk_weights_temp) + # topk_ids.copy_(topk_ids_temp.to(dtypes.i32)) + # if renormalize: + # topk_weights.div_(topk_weights.sum(dim=-1, keepdim=True)) + + aiter.grouped_topk( + gating_output, + topk_weights, + topk_ids, + 1, + 1, + renormalize, + False, # is_softmax=False, go sigmoid + routed_scaling_factor = 1.0 + ) + + return topk_weights, topk_ids diff --git a/aiter/fused_moe_asm.py b/aiter/fused_moe_asm.py new file mode 100644 index 0000000000000000000000000000000000000000..af73d1a8ba1e1e3112660e21af0ddbad7106fb3b --- /dev/null +++ b/aiter/fused_moe_asm.py @@ -0,0 +1,779 @@ +# SPDX-License-Identifier: MIT + +import torch +import torch.nn.functional as F +import ctypes +from typing import Optional +import aiter +from aiter.ops.triton.fused_moe import triton_moe_sum +from aiter import logger +from aiter import pertoken_quant, get_hip_quant +from aiter import ActivationType, QuantType, dtypes + +BLOCK_SIZE_M = 32 + + +def moe_sorting_ck( + topk_ids, + topk_weights, + num_experts, + model_dim, + moebuf_dtype, + block_size=BLOCK_SIZE_M, + expert_mask=None, +): + device = topk_ids.device + M, topk = topk_ids.shape + topk = topk_ids.shape[1] + max_num_tokens_padded = topk_ids.numel() + num_experts * block_size - topk + max_num_m_blocks 
= int((max_num_tokens_padded + block_size - 1) // block_size) + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=dtypes.i32, device=device) + sorted_weights = torch.empty( + (max_num_tokens_padded,), dtype=dtypes.fp32, device=device + ) + sorted_expert_ids = torch.empty( + (max_num_m_blocks,), dtype=dtypes.i32, device=device + ) + tokens_positions_per_expert = torch.empty( + (num_experts*2,), dtype=dtypes.i32, device=device + ) + num_valid_ids = torch.empty((1), dtype=dtypes.i32, device=device) + moe_buf = torch.empty((M, model_dim), dtype=moebuf_dtype, device=device) + + # for now, moe_sorting_fwd only support int32 topk_ids + if topk_ids.dtype != dtypes.i32: + topk_ids = topk_ids.to(dtypes.i32) + + aiter.moe_sorting_fwd( + topk_ids, + topk_weights, + sorted_ids, + sorted_weights, + sorted_expert_ids, + tokens_positions_per_expert, + num_valid_ids, + moe_buf, + num_experts, + block_size, + expert_mask, + ) + return sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf + + +def fused_moe( + hidden_states: torch.Tensor, + d_w1_out: torch.Tensor, + d_silu: torch.Tensor, + d_w2_out: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + top_k: int, + num_valid_ids: torch.Tensor, + sorted_ids: torch.Tensor, + sorted_weight: torch.Tensor, + sorted_expert_ids: torch.Tensor, + block_size: int, +): + + aiter.asm_fmoe_stage1(d_w1_out, hidden_states, w1, w2, sorted_ids, sorted_weight, sorted_expert_ids, num_valid_ids, top_k, block_size=block_size) + + aiter.silu_and_mul(d_silu, d_w1_out) + + aiter.asm_fmoe_stage2(d_w2_out, d_silu, w1, w2, sorted_ids, sorted_weight, sorted_expert_ids, num_valid_ids, top_k, block_size=block_size) + + +def asm_moe( + hidden_states, + w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K + w2, # [expert(local_expert:EP), dim, inter_dim] + topk_weight, + topk_ids, + # following for int8 quant + fc1_scale=None, # [expert(local_expert:EP), inter_dim, 1] + fc2_scale=None, # 
[expert(local_expert:EP), model_dim, 1] + fc1_smooth_scale=None, # [expert(local_expert:EP), 1, model_dim] + fc2_smooth_scale=None, # [expert(local_expert:EP), 1, inter_dim] + a16=False, + per_tensor_quant_scale=None, + block_shape=None, + expert_mask=None, + activation=ActivationType.Silu, +): + E, model_dim, inter_dim = w2.shape + global_E = E + if expert_mask is not None: + global_E = expert_mask.numel() + M, topk = topk_ids.shape + dtype = hidden_states.dtype + device = topk_ids.device + lastdim_mul = 8 if w1.dtype in {dtypes.i32, torch.uint32} else 1 + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck( + topk_ids, topk_weight, global_E, model_dim, dtype, BLOCK_SIZE_M, expert_mask + ) + ) + + #sorted_ids, sorted_expert_ids, num_valid_ids = ( + # moe_align_block_size(topk_ids, 32, global_E, expert_mask)) + kernel_split = True + if kernel_split: + + #chosen_experts = num_valid_ids[0].item() // BLOCK_SIZE_M + #ptr = ctypes.cast(num_valid_ids.data_ptr(), ctypes.POINTER(ctypes.c_int32)) + #custom_stream = torch.cuda.current_stream() + #custom_stream.synchronize() + #chosen_experts = ptr[0] // BLOCK_SIZE_M + #print("chosen_experts",chosen_experts) + #print("topk",topk) + #tokens_ids = torch.empty((chosen_experts), dtype=torch.int32, device=device) + + moe_buf = torch.empty((hidden_states.size(0), w2.size(1)), dtype=torch.float16, device="cuda") + + d_w1_out = torch.empty((hidden_states.size(0) * topk, w1.size(1)), dtype=torch.float16, device="cuda") + d_silu = torch.empty((hidden_states.size(0) * topk, w2.size(2)), dtype=torch.float16, device="cuda") + d_w2_out = torch.empty((hidden_states.size(0), topk, w2.size(1)), dtype=torch.float16, device="cuda") + + fused_moe(hidden_states, + d_w1_out, + d_silu, + d_w2_out, + w1, + w2, + topk, + num_valid_ids, + sorted_ids, + sorted_weights, + sorted_expert_ids, + BLOCK_SIZE_M) + + #aiter.asm_moe_sum(d_w2_out, moe_buf, sorted_ids) + triton_moe_sum(d_w2_out, 
moe_buf) + +# for i in range(0,ChosenExperts[0]): +# chosenTokenIdx = sorted_ids[i*32:i*32+32] & 0xffffff #从 i*32 开始取 32 个数 +# mask = (chosenTokenIdx < hidden_states.size(0)) +# chosenTokenIdx = chosenTokenIdx[mask] #把 < 32的值组成新的tensor +# +# moe_buf[chosenTokenIdx, :] = moe_buf[chosenTokenIdx, :] + d_w2_out[i,:chosenTokenIdx.size(0)] + + elif fc1_scale is None: + # pure bf16 + aiter.fmoe( + moe_buf, + hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + ) + elif a16: + # a16w8 smooth quant fmoe + if w1.dtype in [dtypes.fp8, dtypes.i8] and inter_dim * 2 == w1.shape[1]: + aiter.fmoe_g1u1_a16( + moe_buf, + hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + fc1_scale, + fc2_scale, + fc1_smooth_scale, + fc2_smooth_scale, + ) + elif w1.dtype == dtypes.i8 and inter_dim == w1.shape[1]: + aiter.fmoe_int8_g1u0_a16( + moe_buf, + hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + fc1_scale, + fc2_scale, + fc1_smooth_scale, + fc2_smooth_scale, + ) + else: + raise ValueError(f"Invalid args: {w1.dtype} {w1.shape=} {w2.shape=}") + elif block_shape is not None: + assert ( + dtype == torch.bfloat16 + ), "asm_moe for block_scale only support bfloat16 hidden_states" + assert block_shape == ( + 128, + 128, + ), "asm_moe for block_scale only support (128, 128)" + assert ( + w1.dtype == torch.float8_e4m3fnuz + ), "asm_moe for block_scale only support float8_e4m3fnuz weight" + assert ( + w2.shape[2] * 2 == w1.shape[1] + ), "aiter moe for block_scale only support g1u1" + scale_blk_n, scale_blk_k = block_shape + hidden_states = hidden_states.view(M * model_dim // scale_blk_k, scale_blk_k) + + a1_q, a1_scale = pertoken_quant( + hidden_states.view(-1, model_dim // scale_blk_k, scale_blk_k), + quant_dtype=torch.float8_e4m3fnuz, + ) + a1_q = a1_q.view(-1, model_dim) + a1_scale = a1_scale.squeeze(-1).t().contiguous() + + 
scale_blk_n, scale_blk_k = block_shape + aiter.fmoe_fp8_blockscale_g1u1( + moe_buf, + a1_q, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + topk, + a1_scale, + fc1_scale, + fc2_scale, + scale_blk_n, + scale_blk_k, + None, + ) + else: + # a8w8 fmoe, opt: smooth quant + a8_type = ( + w1.dtype + if w1.dtype != dtypes.i32 and w1.dtype != torch.uint32 + else dtypes.fp8 + ) + if fc1_smooth_scale is not None: + a8 = torch.empty((topk * M, model_dim), dtype=a8_type, device=device) + a8_scale = torch.empty((topk * M), dtype=dtypes.fp32, device=device) + + # moe_smoothquant_fwd need topk_ids which contains local_expert_id + if expert_mask is not None: + local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32) + local_expert_hash[local_expert_hash > 0] -= 1 + topk_ids = local_expert_hash[topk_ids] + + aiter.moe_smoothquant_fwd( + a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale + ) + else: + if ( + w1.dtype == dtypes.fp8 + or w1.dtype == dtypes.i32 + and w1.dtype == torch.uint32 + ): + a8 = torch.empty((M, model_dim), dtype=a8_type, device=device) + a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) + if per_tensor_quant_scale is None: + aiter.dynamic_per_token_scaled_quant(a8, hidden_states, a8_scale) + else: + aiter.static_per_tensor_quant( + a8, hidden_states, per_tensor_quant_scale + ) + a8_scale.fill_(per_tensor_quant_scale) + elif w1.dtype == dtypes.i8: + a8 = torch.empty((M, model_dim), dtype=w1.dtype, device=device) + a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device) + fc1_smooth_scale = torch.ones( + model_dim, dtype=dtypes.fp32, device=device + ) + aiter.smoothquant_fwd(a8, hidden_states, fc1_smooth_scale, a8_scale) + else: + logger.warning("FMOE fall into pure torch quant...") + a8, a8_scale = aiter.pertoken_quant(hidden_states, quant_dtype=w1.dtype) + if w2.shape[2] * lastdim_mul == w1.shape[1]: + fmoe_func = aiter.fmoe_int8_g1u0 + elif w2.shape[2] * 2 * lastdim_mul == w1.shape[1]: + fmoe_func = 
def asm_moe_tkw1(
    hidden_states,
    w1,  # [expert(local_expert:EP), inter_dim*2, dim] N,K
    w2,  # [expert(local_expert:EP), dim, inter_dim]
    topk_weight,
    topk_ids,
    # following for int8 quant
    fc1_scale=None,  # [expert(local_expert:EP), inter_dim, 1]
    fc2_scale=None,  # [expert(local_expert:EP), model_dim, 1]
    fc1_smooth_scale=None,  # [expert(local_expert:EP), 1, model_dim]
    fc2_smooth_scale=None,  # [expert(local_expert:EP), 1, inter_dim]
    a16=False,
    per_tensor_quant_scale=None,
    expert_mask=None,
    activation=ActivationType.Silu,
):
    """Run the fused MoE through the ``tkw1`` assembly kernels.

    The tokens are first sorted per expert with ``moe_sorting_ck``; the kernel
    is then selected from the arguments:

    * ``fc1_scale is None``  -> unquantized ``aiter.fmoe``.
    * ``a16`` is truthy      -> 16-bit activations with 8-bit weights
      (``fmoe_fp8_g1u1_a16`` or ``fmoe_int8_g1u0_a16``).
    * otherwise              -> activations are quantized to 8 bit and
      ``aiter.fmoe_g1u1_tkw1`` is called.

    Args:
        hidden_states: Input activations.
        w1: First-layer expert weights (see shape comment in the signature).
        w2: Second-layer expert weights, unpacked as ``(E, model_dim, inter_dim)``.
        topk_weight: Routing weights passed to the sorter.
        topk_ids: Selected expert ids, shape ``(M, topk)``.
        fc1_scale, fc2_scale: Weight dequantization scales.
        fc1_smooth_scale, fc2_smooth_scale: Optional smooth-quant scales.
        a16: Select the a16w8 kernels.
        per_tensor_quant_scale: If given, static per-tensor activation
            quantization is used instead of dynamic per-token quantization.
        expert_mask: Optional mask over global experts; its ``numel()`` is
            used as the global expert count.
        activation: Forwarded to the a8w8 kernel.

    Returns:
        ``moe_buf`` as produced by ``moe_sorting_ck`` and filled by the kernel.

    Raises:
        ValueError: If the weight dtype/shape combination matches no kernel.
    """
    E, model_dim, inter_dim = w2.shape
    global_E = E if expert_mask is None else expert_mask.numel()
    M, topk = topk_ids.shape
    dtype = hidden_states.dtype
    device = topk_ids.device

    # i32/uint32 weights are treated as packed: the last dim counts 8x and the
    # activations are quantized to fp8 rather than to the weight dtype.
    packed_weight = w1.dtype in (dtypes.i32, torch.uint32)
    lastdim_mul = 8 if packed_weight else 1

    (
        sorted_ids,
        sorted_weights,
        sorted_expert_ids,
        num_valid_ids,
        _tokens_positions_per_expert,
        moe_buf,
    ) = moe_sorting_ck(
        topk_ids, topk_weight, global_E, model_dim, dtype, BLOCK_SIZE_M, expert_mask
    )

    if fc1_scale is None:
        # pure bf16
        aiter.fmoe(
            moe_buf,
            hidden_states,
            w1,
            w2,
            sorted_ids,
            sorted_weights,
            sorted_expert_ids,
            num_valid_ids,
            topk,
        )
    elif a16:
        # a16w8 smooth quant fmoe
        if w1.dtype == dtypes.fp8 and inter_dim * 2 == w1.shape[1]:
            a16_func = aiter.fmoe_fp8_g1u1_a16
        elif w1.dtype == dtypes.i8 and inter_dim == w1.shape[1]:
            a16_func = aiter.fmoe_int8_g1u0_a16
        else:
            raise ValueError(f"Invalid args: {w1.dtype} {w1.shape=} {w2.shape=}")
        a16_func(
            moe_buf,
            hidden_states,
            w1,
            w2,
            sorted_ids,
            sorted_weights,
            sorted_expert_ids,
            num_valid_ids,
            topk,
            fc1_scale,
            fc2_scale,
            fc1_smooth_scale,
            fc2_smooth_scale,
        )
    else:
        # a8w8 fmoe, opt: smooth quant
        a8_type = dtypes.fp8 if packed_weight else w1.dtype
        if fc1_smooth_scale is not None:
            a8 = torch.empty((topk * M, model_dim), dtype=a8_type, device=device)
            a8_scale = torch.empty((topk * M), dtype=dtypes.fp32, device=device)

            # moe_smoothquant_fwd need topk_ids which contains local_expert_id
            if expert_mask is not None:
                local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32)
                local_expert_hash[local_expert_hash > 0] -= 1
                topk_ids = local_expert_hash[topk_ids]

            aiter.moe_smoothquant_fwd(
                a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale
            )
        elif w1.dtype == dtypes.fp8 or packed_weight:
            # FIX: the original test was
            #   fp8 or (i32 and uint32)
            # whose second arm can never hold, so packed weights skipped this
            # path and fell through to the torch fallback below.
            a8 = torch.empty((M, model_dim), dtype=a8_type, device=device)
            a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device)
            if per_tensor_quant_scale is None:
                aiter.dynamic_per_token_scaled_quant(a8, hidden_states, a8_scale)
            else:
                aiter.static_per_tensor_quant(
                    a8, hidden_states, per_tensor_quant_scale
                )
                a8_scale.fill_(per_tensor_quant_scale)
        elif w1.dtype == dtypes.i8:
            a8 = torch.empty((M, model_dim), dtype=w1.dtype, device=device)
            a8_scale = torch.empty(M, dtype=dtypes.fp32, device=device)
            # No smooth scale supplied: use an all-ones scale so the
            # smooth-quant kernel can still be used for the int8 quantization.
            fc1_smooth_scale = torch.ones(
                model_dim, dtype=dtypes.fp32, device=device
            )
            aiter.smoothquant_fwd(a8, hidden_states, fc1_smooth_scale, a8_scale)
        else:
            logger.warning("FMOE fall into pure torch quant...")
            a8, a8_scale = aiter.pertoken_quant(hidden_states, quant_dtype=w1.dtype)

        # Only the g1u1 layout (w1 holds gate and up) has a tkw1 kernel.
        if w2.shape[2] * 2 * lastdim_mul == w1.shape[1]:
            fmoe_func = aiter.fmoe_g1u1_tkw1
        else:
            raise ValueError(
                f"Invalid MoE weight: {w1.shape=} {w2.shape=} {lastdim_mul}"
            )

        fmoe_func(
            moe_buf,
            a8,
            w1,
            w2,
            sorted_ids,
            sorted_weights,
            sorted_expert_ids,
            num_valid_ids,
            topk,
            a8_scale,
            fc1_scale,
            fc2_scale,
            fc2_smooth_scale,
            activation,
        )
    return moe_buf
def torch_moe(
    hidden_states,
    w1,
    w2,
    topk_weight,
    topk_ids,
    # following for int8 quant
    fc1_scale=None,  # [expert(local_expert:EP), inter_dim, 1]
    fc2_scale=None,  # [expert(local_expert:EP), model_dim, 1]
    fc1_smooth_scale=None,  # [expert(local_expert:EP), 1, model_dim]
    fc2_smooth_scale=None,  # [expert(local_expert:EP), 1, inter_dim]
    expert_mask=None,
    activation=ActivationType.Silu,
):
    """Reference (pure torch) mixture-of-experts forward pass.

    All math is done in fp32 and the result is cast back to the dtype of
    ``hidden_states``. For each expert the routed tokens are multiplied by
    ``w1``, passed through the activation (gated when ``w1`` has twice the
    rows of ``w2``'s last dim, i.e. "g1u1"), multiplied by ``w2``, and finally
    combined across the top-k slots using ``topk_weight``.

    Args:
        hidden_states: 2-D input ``(B, D)``.
        w1, w2: Per-expert weights; ``w1[e]`` is applied transposed.
        topk_weight: Routing weights, ``(B, topk)``.
        topk_ids: Expert id per token/slot, compared against ``range(w1.shape[0])``.
        fc1_scale, fc2_scale: Optional row-wise dequantization scales that are
            multiplied into ``w1`` / ``w2`` (applied together when
            ``fc1_scale`` is given).
        fc1_smooth_scale, fc2_smooth_scale: Optional per-expert smooth-quant
            scales applied to the FC1 input / activation output.
        expert_mask: Optional mask over global experts; global ids are mapped
            to local ids and masked-out experts become ``-1`` (never matched).
        activation: ``ActivationType.Gelu`` selects GELU, anything else SiLU.

    Returns:
        Tensor of shape ``(B, D)`` in the original dtype of ``hidden_states``.
    """
    computeType = dtypes.fp32
    dtype = hidden_states.dtype
    hidden_states = hidden_states.to(computeType)
    w1 = w1.to(computeType)
    w2 = w2.to(computeType)
    B, D = hidden_states.shape
    topk = topk_weight.shape[1]

    if expert_mask is not None:
        # Map global expert ids to local ones; experts not owned locally get -1.
        local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32) - 1
        local_expert_hash[expert_mask == 0] = -1
        topk_ids = local_expert_hash[topk_ids]

    # One copy of every token per top-k slot so a boolean mask over
    # (B, topk) selects the rows routed to a given expert.
    hidden_states = hidden_states.view(B, -1, D).repeat(1, topk, 1)
    out = torch.zeros(
        (B, topk, D),
        dtype=computeType,
        device=hidden_states.device,
    )

    inter_dim = w2.shape[2]
    # g1u1: w1 includes gate and up; g1u0: w1 only includes gate.
    gated = w2.shape[2] * 2 == w1.shape[1]

    if fc1_scale is not None:
        # Quantized weights (w8a8): fold the scales into the weights.
        expert = w1.shape[0]
        w2D = w2.shape[-1]
        w1 = (w1.view(-1, D) * fc1_scale.view(-1, 1)).view(expert, -1, D)
        w2 = (w2.view(-1, w2D) * fc2_scale.view(-1, 1)).view(expert, -1, w2D)

    # FIX: normalise each smooth scale on its own. Previously the fc2 scale
    # was reshaped only inside the fc1 check, so supplying fc1_smooth_scale
    # without fc2_smooth_scale crashed on ``None.view``.
    if fc1_smooth_scale is not None:
        fc1_smooth_scale = fc1_smooth_scale.view(fc1_smooth_scale.shape[0], -1)
    if fc2_smooth_scale is not None:
        fc2_smooth_scale = fc2_smooth_scale.view(fc2_smooth_scale.shape[0], -1)

    act_fn = F.gelu if activation == ActivationType.Gelu else F.silu

    for E_id in range(w1.shape[0]):
        mask = topk_ids == E_id
        if not mask.sum():
            continue

        sub_tokens = hidden_states[mask]
        if fc1_smooth_scale is not None:
            sub_tokens = sub_tokens * (fc1_smooth_scale[E_id])

        act_input = sub_tokens @ (w1[E_id].transpose(0, 1))
        if gated:
            gate, up = act_input.split([inter_dim, inter_dim], dim=-1)
            act_out = act_fn(gate) * up
        else:
            act_out = act_fn(act_input)

        if fc2_smooth_scale is not None:
            act_out = act_out * (fc2_smooth_scale[E_id])
        out[mask] = act_out @ (w2[E_id].transpose(0, 1))

    return (out * topk_weight.view(B, -1, 1)).sum(dim=1).to(dtype)
def fused_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    topk_ids: Optional[torch.Tensor] = None,
    topk_weights: Optional[torch.Tensor] = None,
):
    """Select the top-k experts per token with ``aiter.topk_softmax``.

    Args:
        hidden_states: 2-D tensor; only its row count and device are used.
        gating_output: Router logits; must have the same number of rows as
            ``hidden_states``. Passed to the kernel as float32.
        topk: Number of experts to pick per token.
        renormalize: Forwarded to ``aiter.topk_softmax``.
        topk_ids: Optional preallocated output for the expert ids.
        topk_weights: Optional preallocated output for the routing weights.

    Returns:
        ``(topk_weights, topk_ids)``; freshly allocated ``(M, topk)`` buffers
        unless the caller supplied them.
    """
    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"

    num_tokens, _ = hidden_states.shape
    target_device = hidden_states.device

    def _alloc(elem_dtype):
        # Uninitialised (num_tokens, topk) buffer on the activations' device.
        return torch.empty(num_tokens, topk, dtype=elem_dtype, device=target_device)

    topk_weights = _alloc(dtypes.fp32) if topk_weights is None else topk_weights
    topk_ids = _alloc(dtypes.i32) if topk_ids is None else topk_ids

    # Third kernel output; currently not consumed by this function.
    token_expert_indicies = _alloc(dtypes.i32)

    aiter.topk_softmax(
        topk_weights,
        topk_ids,
        token_expert_indicies,
        gating_output.float(),  # TODO(woosuk): Optimize this.
        renormalize,
    )

    return topk_weights, topk_ids
max_num_tokens_padded = topk_ids.numel() + num_experts * block_size - topk + max_num_m_blocks = int((max_num_tokens_padded + block_size - 1) // block_size) + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=dtypes.i32, device=device) + sorted_weights = torch.empty( + (max_num_tokens_padded,), dtype=dtypes.fp32, device=device + ) + sorted_expert_ids = torch.empty( + (max_num_m_blocks,), dtype=dtypes.i32, device=device + ) + tokens_positions_per_expert = torch.empty( + (num_experts*2,), dtype=dtypes.i32, device=device + ) + num_valid_ids = torch.empty((1), dtype=dtypes.i32, device=device) + +# for now, moe_sorting_fwd only support int32 topk_ids + if topk_ids.dtype != dtypes.i32: + topk_ids = topk_ids.to(dtypes.i32) + + aiter.moe_sorting_fwd( + topk_ids, + topk_weights, + sorted_ids, + sorted_weights, + sorted_expert_ids, + tokens_positions_per_expert, + num_valid_ids, + moe_buf, + num_experts, + block_size, + expert_mask, + ) + return sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf + +#@staticmethod +def run_fused_experts_asm_impl(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + dtype: torch.dtype, + inplace: bool = False, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w4a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + use_persist: bool = False, + persist_cu: Optional[int] = 0, + use_shuffle: Optional[int] = 0, + solution_id: Optional[str] = 
def fused_moe_fake(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    dtype,
    inplace: bool = False,
    activation: str = "silu",
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w4a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    global_num_experts: int = -1,
    expert_map: Optional[torch.Tensor] = None,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[list[int]] = None,
    use_persist: bool = False,
    persist_cu: Optional[int] = 0,
    use_shuffle: Optional[int] = 0,
    solution_id: Optional[str] = None,
    routed_scaling_factor: Optional[float] = 1.0,
) -> torch.Tensor:
    """Shape-only stand-in for ``fused_experts_asm_impl``.

    Used as the ``gen_fake`` function of ``torch_compile_guard``: it performs
    no computation and only returns an uninitialised tensor with the shape,
    dtype and device of the real output. All parameters other than
    ``topk_ids``, ``w2`` and ``dtype`` are accepted solely so the signature
    mirrors the real implementation (including ``routed_scaling_factor``,
    which the real op accepts and which was previously missing here).

    Returns:
        Uninitialised tensor of shape ``(topk_ids.shape[0], w2.size(1))`` with
        the requested ``dtype`` on ``topk_ids.device``.
    """
    # Unpacking keeps the original requirement that topk_ids is 2-D.
    num_tokens, _ = topk_ids.shape
    # FIXME: W2.size must be same as hidden_dim
    return torch.empty(
        (num_tokens, w2.size(1)), dtype=dtype, device=topk_ids.device
    )
+ use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + use_persist: bool = False, + persist_cu: Optional[int] = 0, + use_shuffle: Optional[int] = 0, + solution_id: Optional[str] = None, + routed_scaling_factor: Optional[float] = 1.0)-> torch.Tensor: + # Check constraints. + if use_int8_w4a8: + assert block_shape[0] == 0 and block_shape[1] == 64, "[ERROR]ASM Fused MoE only support w4a8 block_shape=64 now." + + if use_shuffle: + assert use_fp8_w8a8 or use_int8_w8a8 or (not use_int4_w4a16 and not use_int4_w4a16), "[ERROR]ASM Fused MoE only support f8 now." + + if use_int4_w4a16 or use_int8_w4a8: + assert hidden_states.shape[1] // 2 == w1.shape[ + 2], "Hidden size mismatch" + else: + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn + ] + + num_tokens, _ = hidden_states.shape + E, N, _ = w1.shape + _, model_dim, inter_dim = w2.shape + if global_num_experts == -1: + global_num_experts = E + top_k_num = topk_ids.shape[1] + # We execute the fused_moe kernel in chunks to circumvent this issue: + # https://github.com/vllm-project/vllm/issues/5938 + # need to change according to token + CHUNK_SIZE = 65536 + M = min(num_tokens, CHUNK_SIZE) + + out_hidden_states = 
torch.empty((num_tokens, model_dim), dtype=dtype, device=hidden_states.device) + for chunk in range((num_tokens // CHUNK_SIZE) + 1): + begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE, + min((chunk + 1) * CHUNK_SIZE, + num_tokens)) + curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] + tokens_in_chunk, _ = curr_hidden_states.shape + + if tokens_in_chunk == 0: + break + + + curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] + curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] + + #FIXME: just for EP Accuracy Test + if expert_map is not None: + d_w1_out = torch.zeros((curr_hidden_states.size(0) * top_k_num, w1.size(1)), dtype=dtype, device=curr_hidden_states.device) + if use_int4_w4a16 or use_int8_w4a8: + d_silu = torch.zeros((curr_hidden_states.size(0) * top_k_num, w2.size(2) * 2), dtype=dtype, device=curr_hidden_states.device) + else: + d_silu = torch.zeros((curr_hidden_states.size(0) * top_k_num, w2.size(2)), dtype=dtype, device=curr_hidden_states.device) + d_w2_out = torch.zeros((curr_hidden_states.size(0), top_k_num, w2.size(1)), dtype=dtype, device=curr_hidden_states.device) + else: + d_w1_out = torch.empty((curr_hidden_states.size(0) * top_k_num, w1.size(1)), dtype=dtype, device=curr_hidden_states.device) + if use_int4_w4a16 or use_int8_w4a8: + d_silu = torch.empty((curr_hidden_states.size(0) * top_k_num, w2.size(2) * 2), dtype=dtype,device=curr_hidden_states.device) + else: + d_silu = torch.empty((curr_hidden_states.size(0) * top_k_num, w2.size(2)), dtype=dtype, device=curr_hidden_states.device) + d_w2_out = torch.empty((curr_hidden_states.size(0), top_k_num, w2.size(1)), dtype=dtype, device=curr_hidden_states.device) + + arch = get_gfx() + cu_num = get_cu_num() + odtype = 0 + if dtype == torch.bfloat16: + odtype = 1 + if use_persist: + if persist_cu <= 0 or persist_cu >= cu_num: + persist_cu = cu_num + else: + persist_cu = 0 + # INT4 w4a16 + if use_int4_w4a16: + if solution_id is None: + solution_id = 
get_moe_asm_solution(arch, tokens_in_chunk, N/2, w1.size(2)*2, E, top_k_num, MoeQuantType.INT4_W4A16) + if block_shape is not None and block_shape[1] == 32: + config = decode_sol_w4a16_gw32() + else: + config = decode_sol_w4a16(solution_id) + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts, model_dim, out_hidden_states[begin_chunk_idx:end_chunk_idx], config["BLOCK_SIZE_M"], expert_map) + ) + if print_log(): + print(f"Asm Moe Size: chunk:{chunk}, arch:{arch}, quant:{MoeQuantType.INT4_W4A16}, tokens:{tokens_in_chunk}, inter_dim:{int(N/2)}, model_dim:{w1.size(2)*2}, expert:{E}, topk:{top_k_num}") + print(f"solution:{solution_id}, shuffle:{use_shuffle}, persist:{persist_cu}") + if solution_id == "default": + print(f">>> Warning: No matching config pattern found, using default asm solution.") + solution_id = None + + if dtype == torch.bfloat16: + if block_shape is not None and block_shape[1] == 32: + aiter.asm_fmoe_stage1(d_w1_out, + curr_hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + w1_scale, + w1_scale, + w1_zp, + 4, + config["SOL_ID1"], + config["BLOCK_SIZE_M"]) + else: + aiter.asm_fmoe_stage1(d_w1_out, + curr_hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + w1_scale, + w1_scale, + w1_zp, + 3, + config["SOL_ID1"], + config["BLOCK_SIZE_M"]) + else: + aiter.asm_fmoe_stage1(d_w1_out, + curr_hidden_states, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + w1_scale, + w1_scale, + w1_zp, + 2, + config["SOL_ID1"], + config["BLOCK_SIZE_M"]) + if activation == "silu": + triton_silu_and_mul(d_silu,d_w1_out) + # silu_and_mul(d_silu,d_w1_out) + elif activation == "gelu": + triton_gelu_and_mul(d_silu,d_w1_out) + # gelu_and_mul(d_silu,d_w1_out) + else: + raise ValueError(f"Unsupported 
FusedMoe activation: {activation}") + if dtype == torch.bfloat16: + if block_shape is not None and block_shape[1] == 32: + aiter.asm_fmoe_stage2(d_w2_out, + d_silu, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + w2_scale, + w2_scale, + w2_zp, + 4, + config["SOL_ID2"], + config["BLOCK_SIZE_M"]) + else: + aiter.asm_fmoe_stage2(d_w2_out, + d_silu, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + w2_scale, + w2_scale, + w2_zp, + 3, + config["SOL_ID2"], + config["BLOCK_SIZE_M"]) + else: + aiter.asm_fmoe_stage2(d_w2_out, + d_silu, + w1, + w2, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + w2_scale, + w2_scale, + w2_zp, + 2, + config["SOL_ID2"], + config["BLOCK_SIZE_M"]) + #int8 channel wise + elif use_int8_w8a8 and per_channel_quant: + if solution_id is None: + solution_id = get_moe_asm_solution(arch, tokens_in_chunk, N/2, w1.size(2), E, top_k_num, MoeQuantType.INT8_W8A8_C, use_shuffle) + config = decode_sol_w8a8_c(solution_id) + if persist_cu == cu_num: + calculate_persist_groups(persist_cu, config, MoeQuantType.INT8_W8A8_C) + else: + config["PERSIST_GROUP1"] = persist_cu + config["PERSIST_GROUP2"] = persist_cu + + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts, model_dim, out_hidden_states[begin_chunk_idx:end_chunk_idx], config["BLOCK_SIZE_M"], expert_map) + ) + if print_log(): + print(f"Asm Moe Size: chunk:{chunk}, arch:{arch}, quant:{MoeQuantType.INT8_W8A8_C}, tokens:{tokens_in_chunk}, inter_dim:{int(N/2)}, model_dim:{w1.size(2)}, expert:{E}, topk:{top_k_num}") + print(f"solution:{solution_id}, shuffle:{use_shuffle}, persist:{persist_cu}") + if solution_id== "default": + print(f">>> Warning: No matching config pattern found, using default asm solution.") + solution_id = None + + if curr_hidden_states.dtype == 
torch.float16 or curr_hidden_states.dtype == torch.bfloat16: + input_q,input_scale = per_token_quant_hip(curr_hidden_states) + #input_q,input_scale = per_token_quant_int8(curr_hidden_states) + else: + input_q,input_scale = curr_hidden_states,a1_scale + aiter.asm_fmoe_a8(d_w1_out, + input_q, + w1, + w1, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + input_scale, + w1_scale, + w1_zp, + 0, + config["SOL_ID1"], + odtype, + config["PERSIST_GROUP1"], + use_shuffle) + if activation == "silu": + triton_silu_and_mul(d_silu,d_w1_out) + # silu_and_mul(d_silu,d_w1_out) + elif activation == "gelu": + triton_gelu_and_mul(d_silu,d_w1_out) + # gelu_and_mul(d_silu,d_w1_out) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + bridge_q,bridge_scale = per_token_quant_hip(d_silu) + #bridge_q,bridge_scale = per_token_quant_int8(d_silu) + aiter.asm_fmoe_a8(d_w2_out, + bridge_q, + w2, + w2, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + bridge_scale, + w2_scale, + w2_zp, + 1, + config["SOL_ID2"], + odtype, + config["PERSIST_GROUP2"], + use_shuffle) + #w4a8 block wise = 64 + elif use_int8_w4a8: + if solution_id is None: + solution_id = get_moe_asm_solution(arch, tokens_in_chunk, N/2, w1.size(2)*2, E, top_k_num, MoeQuantType.INT4_W4A8) + config = decode_sol_0(solution_id) + if persist_cu == cu_num: + calculate_persist_groups(persist_cu, config, MoeQuantType.INT4_W4A8) + else: + config["PERSIST_GROUP1"] = persist_cu + config["PERSIST_GROUP2"] = persist_cu + + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts, model_dim, out_hidden_states[begin_chunk_idx:end_chunk_idx], config["BLOCK_SIZE_M"], expert_map) + ) + if print_log(): + print(f"Asm Moe Size: chunk:{chunk}, arch:{arch}, quant:{MoeQuantType.INT4_W4A8}, tokens:{tokens_in_chunk}, inter_dim:{int(N/2)}, 
model_dim:{w1.size(2)*2}, expert:{E}, topk:{top_k_num}") + print(f"solution:{solution_id}, shuffle:{use_shuffle}, persist:{persist_cu}") + if solution_id== "default": + print(f">>> Warning: No matching config pattern found, using default asm solution.") + solution_id = None + + if curr_hidden_states.dtype == torch.float16 or curr_hidden_states.dtype==torch.bfloat16: + input_q,input_scale = per_token_group_quant_int8(curr_hidden_states, block_shape[1]) + else: + input_q,input_scale = curr_hidden_states,a1_scale + #quant_func = get_hip_quant(QuantType.per_1x64) + #input_q,input_scale = quant_func(curr_hidden_states, quant_dtype=dtypes.i8) + # input_q,input_scale = per_token_group_quant_int8(curr_hidden_states, block_shape[1]) + + aiter.asm_fmoe_a8(d_w1_out, + input_q, + w1, + w1, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + input_scale, + w1_scale, + w1_zp, + 10, + config["SOL_ID1"], + odtype, + config["PERSIST_GROUP1"]) + if activation == "silu": + triton_silu_and_mul(d_silu,d_w1_out) + # silu_and_mul(d_silu,d_w1_out) + elif activation == "gelu": + triton_gelu_and_mul(d_silu,d_w1_out) + # gelu_and_mul(d_silu,d_w1_out) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + #quant_func = get_hip_quant(QuantType.per_1x64) + #bridge_q,bridge_scale = quant_func(d_silu, quant_dtype=dtypes.i8) + bridge_q,bridge_scale = per_token_group_quant_int8(d_silu, block_shape[1]) + + aiter.asm_fmoe_a8(d_w2_out, + bridge_q, + w2, + w2, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + bridge_scale, + w2_scale, + w2_zp, + 11, + config["SOL_ID2"], + odtype, + config["PERSIST_GROUP2"]) + #int8 block wise = 128 + elif use_int8_w8a8: + if solution_id is None: + solution_id = get_moe_asm_solution(arch, tokens_in_chunk, N/2, w1.size(2), E, top_k_num, MoeQuantType.INT8_W8A8, use_shuffle) + config = decode_sol_0(solution_id, use_shuffle) + if persist_cu == cu_num: + 
calculate_persist_groups(persist_cu, config, MoeQuantType.INT8_W8A8) + else: + config["PERSIST_GROUP1"] = persist_cu + config["PERSIST_GROUP2"] = persist_cu + + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts, model_dim, out_hidden_states[begin_chunk_idx:end_chunk_idx], config["BLOCK_SIZE_M"], expert_map) + ) + if print_log(): + print(f"Asm Moe Size: chunk:{chunk}, arch:{arch}, quant:{MoeQuantType.INT8_W8A8}, tokens:{tokens_in_chunk}, inter_dim:{int(N/2)}, model_dim:{w1.size(2)}, expert:{E}, topk:{top_k_num}") + print(f"solution:{solution_id}, shuffle:{use_shuffle}, persist:{persist_cu}") + if solution_id== "default": + print(f">>> Warning: No matching config pattern found, using default asm solution.") + solution_id = None + + if curr_hidden_states.dtype == torch.float16 or curr_hidden_states.dtype==torch.bfloat16: + input_q,input_scale = per_block_quant_wrapper((1,block_shape[1]))(per_token_quant_hip)(curr_hidden_states, quant_dtype=torch.int8) + else: + input_q,input_scale = curr_hidden_states,a1_scale + aiter.asm_fmoe_a8(d_w1_out, + input_q, + w1, + w1, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + input_scale, + w1_scale, + w1_zp, + 2, + config["SOL_ID1"], + odtype, + config["PERSIST_GROUP1"], + use_shuffle) + if activation == "silu": + triton_silu_and_mul(d_silu,d_w1_out) + # silu_and_mul(d_silu,d_w1_out) + elif activation == "gelu": + triton_gelu_and_mul(d_silu,d_w1_out) + # gelu_and_mul(d_silu,d_w1_out) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + #FIXME: aiter quant method performance is little worse than triton. Change it latter!! 
+ bridge_q, bridge_scale = per_block_quant_wrapper((1,block_shape[1]))(per_token_quant_hip)(d_silu, quant_dtype=torch.int8) + # bridge_q,bridge_scale = per_token_group_quant_int8(d_silu, block_shape[1]) + aiter.asm_fmoe_a8(d_w2_out, + bridge_q, + w2, + w2, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + bridge_scale, + w2_scale, + w2_zp, + 3, + config["SOL_ID2"], + odtype, + config["PERSIST_GROUP2"], + use_shuffle) + #f8 channel wise + elif use_fp8_w8a8 and per_channel_quant: + if solution_id is None: + solution_id = get_moe_asm_solution(arch, tokens_in_chunk, N/2, w1.size(2), E, top_k_num, MoeQuantType.F8_W8A8_C, use_shuffle) + config = decode_sol_w8a8_c(solution_id) + if persist_cu == cu_num: + calculate_persist_groups(persist_cu, config, MoeQuantType.F8_W8A8_C) + else: + config["PERSIST_GROUP1"] = persist_cu + config["PERSIST_GROUP2"] = persist_cu + + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts, model_dim, out_hidden_states[begin_chunk_idx:end_chunk_idx], config["BLOCK_SIZE_M"], expert_map) + ) + if print_log(): + print(f"Asm Moe Size: chunk:{chunk}, arch:{arch}, quant:{MoeQuantType.F8_W8A8_C}, tokens:{tokens_in_chunk}, inter_dim:{int(N/2)}, model_dim:{w1.size(2)}, expert:{E}, topk:{top_k_num}") + print(f"solution:{solution_id}, shuffle:{use_shuffle}, persist:{persist_cu}") + if solution_id== "default": + print(f">>> Warning: No matching config pattern found, using default asm solution.") + solution_id = None + + if curr_hidden_states.dtype == torch.float16 or curr_hidden_states.dtype==torch.bfloat16: + input_q,input_scale = per_token_quant_hip(curr_hidden_states, quant_dtype=torch.float8_e4m3fn) + else: + input_q,input_scale = curr_hidden_states,a1_scale + aiter.asm_fmoe_a8(d_w1_out, + input_q, + w1, + w1, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + 
input_scale, + w1_scale, + w1_zp, + 4, + config["SOL_ID1"], + odtype, + config["PERSIST_GROUP1"], + use_shuffle) + if activation == "silu": + triton_silu_and_mul(d_silu,d_w1_out) + # silu_and_mul(d_silu,d_w1_out) + elif activation == "gelu": + triton_gelu_and_mul(d_silu,d_w1_out) + # gelu_and_mul(d_silu,d_w1_out) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + bridge_q,bridge_scale= per_token_quant_hip(d_silu, quant_dtype=torch.float8_e4m3fn) + aiter.asm_fmoe_a8(d_w2_out, + bridge_q, + w2, + w2, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + bridge_scale, + w2_scale, + w2_zp, + 5, + config["SOL_ID2"], + odtype, + config["PERSIST_GROUP2"], + use_shuffle) + #f8 block wise = 128 + elif use_fp8_w8a8: + if solution_id is None: + solution_id = get_moe_asm_solution(arch, tokens_in_chunk, N/2, w1.size(2), E, top_k_num, MoeQuantType.F8_W8A8, use_shuffle) + config = decode_sol_0(solution_id, use_shuffle) + if persist_cu == cu_num: + calculate_persist_groups(persist_cu, config, MoeQuantType.F8_W8A8) + else: + config["PERSIST_GROUP1"] = persist_cu + config["PERSIST_GROUP2"] = persist_cu + + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts, model_dim, out_hidden_states[begin_chunk_idx:end_chunk_idx], config["BLOCK_SIZE_M"], expert_map) + ) + if print_log(): + print(f"Asm Moe Size: chunk:{chunk}, arch:{arch}, quant:{MoeQuantType.F8_W8A8}, tokens:{tokens_in_chunk}, inter_dim:{int(N/2)}, model_dim:{w1.size(2)}, expert:{E}, topk:{top_k_num}") + print(f"solution:{solution_id}, shuffle:{use_shuffle}, persist:{persist_cu}") + if solution_id== "default": + print(f">>> Warning: No matching config pattern found, using default asm solution.") + solution_id = None + + if curr_hidden_states.dtype == torch.float16 or curr_hidden_states.dtype==torch.bfloat16: + input_q,input_scale = 
per_block_quant_wrapper((1,block_shape[1]))(per_token_quant_hip)(curr_hidden_states, quant_dtype=torch.float8_e4m3fn) + else: + input_q,input_scale = curr_hidden_states,a1_scale + aiter.asm_fmoe_a8(d_w1_out, + input_q, + w1, + w1, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + input_scale, + w1_scale, + w1_zp, + 6, + config["SOL_ID1"], + odtype, + config["PERSIST_GROUP1"], + use_shuffle) + if activation == "silu": + triton_silu_and_mul(d_silu,d_w1_out) + # silu_and_mul(d_silu,d_w1_out) + elif activation == "gelu": + triton_gelu_and_mul(d_silu,d_w1_out) + # gelu_and_mul(d_silu,d_w1_out) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + bridge_q,bridge_scale = per_block_quant_wrapper((1,block_shape[1]))(per_token_quant_hip)(d_silu, quant_dtype=torch.float8_e4m3fn) + aiter.asm_fmoe_a8(d_w2_out, + bridge_q, + w2, + w2, + sorted_ids, + curr_topk_weights, + sorted_expert_ids, + num_valid_ids, + top_k_num, + bridge_scale, + w2_scale, + w2_zp, + 7, + config["SOL_ID2"], + odtype, + config["PERSIST_GROUP2"], + use_shuffle) + # + else: + if solution_id is None: + solution_id = get_moe_asm_solution(arch, tokens_in_chunk, N/2, w1.size(2), E, top_k_num, MoeQuantType.NO_QUANT, use_shuffle) + config = decode_sol_w8a8_c(solution_id) + if persist_cu == cu_num: + calculate_persist_groups(persist_cu, config, MoeQuantType.NO_QUANT) + else: + config["PERSIST_GROUP1"] = persist_cu + config["PERSIST_GROUP2"] = persist_cu + + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf = ( + moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts, model_dim, out_hidden_states[begin_chunk_idx:end_chunk_idx], config["BLOCK_SIZE_M"], expert_map) + ) + if print_log(): + print(f"Asm Moe Size: chunk:{chunk}, arch:{arch}, quant:{MoeQuantType.NO_QUANT}, tokens:{tokens_in_chunk}, inter_dim:{int(N/2)}, model_dim:{w1.size(2)}, expert:{E}, topk:{top_k_num}") + 
@lru_cache(maxsize=1)
def print_log():
    """Return whether ASM MoE debug printing is enabled.

    Printing is switched on by the mere presence of the ``ASM_MOE_LOG``
    environment variable; its value is not inspected.  The result is
    memoized, so the environment is consulted only on the first call.
    """
    return os.getenv("ASM_MOE_LOG") is not None


class MoeQuantType:
    """String identifiers of the quantization schemes known to the ASM MoE path.

    The values double as the ``quant_type`` column of the tuned-solution CSV
    files and as keys of ``CSV_FILE_MAPPING``.
    """

    NO_QUANT = "no_quant"
    INT4_W4A16 = "int4_w4a16"
    INT4_W4A8 = "int4_w4a8"
    INT8_W8A8 = "int8_w8a8_block"
    INT8_W8A8_C = "int8_w8a8_channel"
    F8_W8A8 = "f8_w8a8_block"
    F8_W8A8_C = "f8_w8a8_channel"

    ALL_TYPES = [NO_QUANT, INT4_W4A16, INT4_W4A8, INT8_W8A8, INT8_W8A8_C, F8_W8A8, F8_W8A8_C]

    @classmethod
    def is_valid(cls, qtype_str: str) -> bool:
        """Return True when *qtype_str* is one of ``ALL_TYPES``."""
        return qtype_str in cls.ALL_TYPES

    @classmethod
    def get_default(cls) -> str:
        """Return the identifier used when no quantization is requested."""
        return cls.NO_QUANT


# Parsed tuning tables, keyed by quant type (with an "_s" suffix for the
# shuffled-weight variant).  Each value maps
# (arch, inter_dim, model_dim, expert, topk, quant_type) to a tuple of
# (ascending token counts, matching sol_ids, the sorted DataFrame slice).
_cached_data_by_quant = {}
# Guards every read and write of _cached_data_by_quant.
_data_lock = threading.RLock()

# Quant type (optionally suffixed with "_s" for shuffled weights) -> CSV file
# name under <AITER_ROOT_DIR>/aiter/configs.
CSV_FILE_MAPPING = {
    "no_quant": "tuned_fmoe_asm.csv",
    "int4_w4a16": "tuned_fmoe_asm_w4a16.csv",
    "int4_w4a8": "tuned_fmoe_asm_w4a8_group.csv",
    "int8_w8a8_block": "tuned_fmoe_asm_w8a8_group.csv",
    "int8_w8a8_channel": "tuned_fmoe_asm_w8a8_channel.csv",
    "f8_w8a8_block": "tuned_fmoe_asm_w8a8_group.csv",
    "f8_w8a8_channel": "tuned_fmoe_asm_w8a8_channel.csv",
    "no_quant_s": "tuned_fmoe_asm_shuffle.csv",
    "int8_w8a8_block_s": "tuned_fmoe_asm_w8a8_group_shuffle.csv",
    "int8_w8a8_channel_s": "tuned_fmoe_asm_w8a8_channel_shuffle.csv",
    "f8_w8a8_block_s": "tuned_fmoe_asm_w8a8_group_shuffle.csv",
    "f8_w8a8_channel_s": "tuned_fmoe_asm_w8a8_channel_shuffle.csv",
}


def get_csv_path(quant_type):
    """Return the path of the tuned-solution CSV for *quant_type*.

    Types missing from ``CSV_FILE_MAPPING`` fall back to
    ``<quant_type>.csv`` in the same configs directory.
    """
    filename = CSV_FILE_MAPPING.get(quant_type, f"{quant_type}.csv")
    return os.path.join(AITER_ROOT_DIR, "aiter", "configs", filename)


def load_and_cache_csv_for_quant(quant_type, use_shuffle):
    """Load the tuned-solution CSV for a quant type into the module cache.

    Args:
        quant_type: One of the ``MoeQuantType`` identifiers.
        use_shuffle: ``1`` selects the shuffled-weight table (``"_s"`` suffix);
            any other value selects the plain table.

    Returns:
        True when the table is cached (now or from an earlier call); False
        when the CSV is missing or cannot be parsed.  Failures are reported
        with ``print`` and never raised, so callers can fall back to the
        default solution.
    """
    quant_type_file = quant_type + "_s" if use_shuffle == 1 else quant_type

    with _data_lock:
        # Cache hit: answer without touching the filesystem again.
        if quant_type_file in _cached_data_by_quant:
            return True

        csv_path = get_csv_path(quant_type_file)
        if not os.path.exists(csv_path):
            print(f"Asm moe tuned csv not found: {csv_path}")
            return False

        try:
            print(f"Load asm moe tuned csv: {csv_path}")
            moe_asm_cfg = pd.read_csv(csv_path)

            # Group by key parameters
            group_cols = ['arch', 'inter_dim', 'model_dim', 'expert', 'topk', 'quant_type']
            cached_groups = {}

            for group_key, group_df in moe_asm_cfg.groupby(group_cols):
                arch, inter_dim, model_dim, expert, topk, qtype = group_key

                # Tokens must be ascending integers for the bisect lookup.
                group_df = group_df.sort_values('token')
                tokens_array = group_df['token'].values.astype(int)
                sol_ids_array = group_df['sol_id'].values

                # Normalize the key to (str, int, int, int, int, str), the
                # exact form the lookup side builds.  pandas infers column
                # dtypes, so e.g. a purely numeric ``arch`` column would
                # otherwise yield integer keys that never match.
                lookup_key = (str(arch), int(inter_dim), int(model_dim),
                              int(expert), int(topk), str(qtype))
                cached_groups[lookup_key] = (tokens_array, sol_ids_array, group_df)
        except Exception as e:
            # Deliberate best-effort: any malformed table degrades to the
            # default solution instead of aborting inference.
            print(f"Load asm moe tuned csv failed {csv_path}: {e}")
            return False

        # Publish only a fully built table.
        _cached_data_by_quant[quant_type_file] = cached_groups
        return True
@lru_cache(maxsize=4096)
def get_moe_asm_solution(
    arch,
    token,
    inter_dim,
    model_dim,
    expert,
    topk,
    quant_type,
    use_shuffle=0,
    q_size_n=0,
    q_size_k=0
):
    """Look up the tuned ASM solution id for a MoE problem size.

    The tuning table for ``quant_type`` is loaded on demand.  Within the
    matching (arch, inter_dim, model_dim, expert, topk, quant_type) group the
    entry whose token count is closest to ``token`` is chosen; ties go to the
    smaller token count and out-of-range values clamp to the first/last entry.

    Args:
        arch, token, inter_dim, model_dim, expert, topk: Problem description;
            the numeric ones are converted with ``int()`` before the lookup.
        quant_type: A ``MoeQuantType`` identifier.
        use_shuffle: ``1`` selects the shuffled-weight tuning table.
        q_size_n, q_size_k: Not used by the lookup; they only take part in
            the ``lru_cache`` key.

    Returns:
        The ``sol_id`` cell of the selected row, or the string ``"default"``
        when no table or no matching group is available.
    """
    if not load_and_cache_csv_for_quant(quant_type, use_shuffle):
        return "default"

    quant_type_file = quant_type + "_s" if use_shuffle == 1 else quant_type

    with _data_lock:
        cached_groups = _cached_data_by_quant.get(quant_type_file)
        if cached_groups is None:
            return "default"

        cache_key = (str(arch), int(inter_dim), int(model_dim),
                     int(expert), int(topk), str(quant_type))
        if cache_key not in cached_groups:
            return "default"

        tokens_array, sol_ids_array, _ = cached_groups[cache_key]
        token_int = int(token)

        if len(tokens_array) == 0:
            return "default"

        # Clamp to the ends of the tuned range.
        if token_int <= tokens_array[0]:
            return sol_ids_array[0]
        if token_int >= tokens_array[-1]:
            return sol_ids_array[-1]

        # Strictly inside the range, so 0 < idx < len(tokens_array).
        idx = bisect_left(tokens_array, token_int)
        if tokens_array[idx] == token_int:
            return sol_ids_array[idx]

        # No exact match: take the nearer neighbour, preferring the left one.
        left_dist = token_int - tokens_array[idx - 1]
        right_dist = tokens_array[idx] - token_int
        if left_dist <= right_dist:
            return sol_ids_array[idx - 1]
        return sol_ids_array[idx]


# Inclusive SOL_ID1 ranges and the BLOCK_SIZE_M each range selects.
_BLOCK_SIZE_M_BY_SOL_ID1 = (
    (10000, 10999, 16),
    (11000, 11999, 32),
    (12000, 12999, 64),
    (13000, 13999, 128),
    (14000, 14999, 256),
)


def _decode_solution(solution, label, max_sol_id1) -> Dict[str, int]:
    """Parse ``"<sol_id1>+<sol_id2>"`` and derive BLOCK_SIZE_M from sol_id1.

    ``label`` is the name used for sol_id1 in the range error message and
    ``max_sol_id1`` is the largest accepted sol_id1 (inclusive).
    """
    parts = solution.split("+")
    if len(parts) != 2:
        raise ValueError("Invalid solution_id")
    sol_id1 = int(parts[0])
    sol_id2 = int(parts[1])

    for low, high, block_size_m in _BLOCK_SIZE_M_BY_SOL_ID1:
        if high <= max_sol_id1 and low <= sol_id1 <= high:
            return {
                "SOL_ID1": sol_id1,
                "SOL_ID2": sol_id2,
                "BLOCK_SIZE_M": block_size_m,
            }
    raise ValueError(
        f"{label} value {sol_id1} is not in the expected ranges (10000-{max_sol_id1})")


def decode_sol_w4a16(solution) -> Dict[str, int]:
    """Decode a W4A16 solution string into SOL_ID1 / SOL_ID2 / BLOCK_SIZE_M.

    ``"default"`` yields the built-in fallback.  Raises ``ValueError`` for a
    malformed string or a sol_id1 outside 10000-13999.
    """
    if solution == "default":
        return {
            "SOL_ID1": 11002,
            "SOL_ID2": 21002,
            "BLOCK_SIZE_M": 32,
        }
    return _decode_solution(solution, "key1", 13999)


def decode_sol_w4a16_gw32() -> Dict[str, int]:
    """Return the fixed configuration used by the W4A16 ``gw32`` variant."""
    return {
        "SOL_ID1": 50032,
        "SOL_ID2": 60032,
        "BLOCK_SIZE_M": 32,
    }


def decode_sol_0(solution, use_shuffle=0) -> Dict[str, int]:
    """Decode a block-wise W8A8 solution string.

    ``"default"`` yields the built-in fallback, whose SOL_ID1 is 10001
    instead of 10000 when ``use_shuffle`` is truthy.  Raises ``ValueError``
    for a malformed string or a sol_id1 outside 10000-14999.
    """
    if solution == "default":
        return {
            "SOL_ID1": 10001 if use_shuffle else 10000,
            "SOL_ID2": 20000,
            "BLOCK_SIZE_M": 16,
        }
    # The accepted range ends at 14999 (BLOCK_SIZE_M 256); the previous error
    # text wrongly advertised 13999.
    return _decode_solution(solution, "key1", 14999)


def decode_sol_w8a8_c(solution) -> Dict[str, int]:
    """Decode a channel-wise W8A8 (also used for no-quant) solution string.

    ``"default"`` yields the built-in fallback.  Raises ``ValueError`` for a
    malformed string or a sol_id1 outside 10000-13999.
    """
    if solution == "default":
        return {
            "SOL_ID1": 11000,
            "SOL_ID2": 21001,
            "BLOCK_SIZE_M": 32,
        }
    # The accepted range ends at 13999 (BLOCK_SIZE_M 128); the previous error
    # text wrongly advertised 12999.
    return _decode_solution(solution, "sol_id1", 13999)


def calculate_persist_groups(persist_cu, config, quant_type):
    """Fill ``config["PERSIST_GROUP1"]`` and ``config["PERSIST_GROUP2"]`` in place.

    Each value is ``persist_cu`` multiplied by the maximum number of
    workgroups of the corresponding SOL_ID that can execute per CU, or plain
    ``persist_cu`` when the SOL_ID has no table entry.

    Quant types without a table (e.g. no-quant) now also receive the plain
    ``persist_cu`` defaults: the function used to return without writing the
    keys, and the no-quant caller reads ``config["PERSIST_GROUP1"]``
    immediately afterwards, which raised ``KeyError``.
    """
    # Maximum number of sol_id workgroups executable per CU.
    if quant_type == MoeQuantType.INT4_W4A8:
        sol_id_table = {
            20000: 4,
            21000: 4,
            22000: 2,
            23000: 2,
        }
    elif quant_type in (MoeQuantType.INT8_W8A8_C, MoeQuantType.F8_W8A8_C):
        sol_id_table = {
            20000: 7,
            20001: 5,
            21005: 7,
            21006: 5,
            22003: 4,
            22004: 3,
            23000: 3,
            23001: 2,
            23002: 2,
        }
    elif quant_type in (MoeQuantType.INT8_W8A8, MoeQuantType.F8_W8A8):
        sol_id_table = {
            20000: 4,
            21000: 4,
            22000: 2,
            23000: 2,
            23001: 4,
            24000: 2,
            24001: 4,
        }
    else:
        sol_id_table = {}

    for i in (1, 2):
        sol_id = config[f"SOL_ID{i}"]
        if sol_id in sol_id_table:
            config[f"PERSIST_GROUP{i}"] = persist_cu * sol_id_table[sol_id]
        else:
            config[f"PERSIST_GROUP{i}"] = persist_cu
# Module-wide fused-MoE config override; None means "no override installed".
_global_config: Optional[Dict] = None


@contextmanager
def override_config(config: Dict):
    """Temporarily install *config* as the module-wide config override.

    The previous value is restored when the ``with`` block exits, including
    when the block raises (previously an exception left the override in
    place).
    """
    global _global_config
    old_config = _global_config  # remember the current config
    _global_config = config  # apply the new config
    try:
        yield  # run the body of the ``with`` block
    finally:
        _global_config = old_config  # always restore the previous config


def get_config() -> Optional[Dict]:
    """Return the config installed by ``override_config``, or None."""
    return _global_config


def scaled_fp8_quant(A: torch.Tensor, A_scale: torch.Tensor):
    """Placeholder for scaled FP8 quantization.

    TODO: not implemented yet; currently a no-op that returns None.
    """


def get_device_name():
    """Return the name of CUDA device 0 with spaces replaced by underscores.

    Returns ``"cpu"`` when CUDA is not available.
    """
    if torch.cuda.is_available():
        # Name of CUDA device 0, e.g. "NVIDIA A100-SXM4-80GB".
        device_name = torch.cuda.get_device_name(0)
    else:
        # No CUDA: fall back to a plain CPU marker.
        device_name = "cpu"
    # The name is embedded in config file names, so avoid spaces.
    return device_name.replace(" ", "_")


# int8 per-token quantization
@triton.jit
def _per_token_quant_int8(
    x_ptr,
    xq_ptr,
    scale_ptr,
    stride_x,
    stride_xq,
    N,
    BLOCK: tl.constexpr,
):
    """Quantize one row of ``x`` to int8 with a single absmax/127 scale.

    Each program handles the row selected by ``program_id(0)``; ``BLOCK``
    must cover the ``N`` columns of the row.
    """
    row_id = tl.program_id(0)

    cols = tl.arange(0, BLOCK)
    mask = cols < N

    x = tl.load(x_ptr + row_id * stride_x + cols,
                mask=mask, other=0.0).to(tl.float32)
    # Clamp the row maximum so an all-zero row does not divide by zero.
    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
    scale_x = absmax / 127
    x_q = x * (127 / absmax)
    # x_q = tl.extra.cuda.libdevice.nearbyint(x_q).to(tl.int8)
    x_q = libdevice.nearbyint(x_q).to(tl.int8)

    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
    tl.store(scale_ptr + row_id, scale_x)


def per_token_quant_int8(x):
    """Quantize ``x`` to int8 with one float32 scale per row of the last dim.

    Args:
        x: Contiguous tensor; rows are taken along the last dimension.

    Returns:
        ``(x_q, scales)`` where ``x_q`` is int8 with the shape of ``x`` and
        ``scales`` is float32 with shape ``x.shape[:-1] + (1,)``.
    """
    # Validate before allocating any output buffers.
    assert x.is_contiguous()

    M = x.numel() // x.shape[-1]
    N = x.shape[-1]
    x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
    scales = torch.empty(x.shape[:-1] + (1,),
                         device=x.device, dtype=torch.float32)
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)

    _per_token_quant_int8[(M,)](
        x,
        x_q,
        scales,
        stride_x=x.stride(-2),
        stride_xq=x_q.stride(-2),
        N=N,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=1,
    )

    return x_q, scales


# CK sorting operator
def moe_sorting_ck(
    topk_ids,
    topk_weights,
    num_experts,
    model_dim,
    moebuf_dtype,
    block_size=32,
    expert_mask=None,
):
    """Allocate the sorting buffers and run ``aiter.moe_sorting_fwd``.

    Args:
        topk_ids: 2-D ``(M, topk)`` tensor of expert ids; converted to int32
            when it has another dtype.
        topk_weights: Routing weights, forwarded to the kernel unchanged.
        num_experts: Number of experts used to size the buffers.
        model_dim: Second dimension of the allocated ``moe_buf``.
        moebuf_dtype: dtype of the allocated ``moe_buf``.
        block_size: Block size used for padding and forwarded to the kernel.
        expert_mask: Forwarded to the kernel unchanged.

    Returns:
        ``(sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids,
        tokens_positions_per_expert, moe_buf)`` as filled by the kernel.
    """
    device = topk_ids.device
    M, topk = topk_ids.shape
    max_num_tokens_padded = topk_ids.numel() + num_experts * block_size - topk
    max_num_m_blocks = int((max_num_tokens_padded + block_size - 1) // block_size)
    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=dtypes.i32, device=device)
    sorted_weights = torch.empty(
        (max_num_tokens_padded,), dtype=dtypes.fp32, device=device
    )
    sorted_expert_ids = torch.empty(
        (max_num_m_blocks,), dtype=dtypes.i32, device=device
    )
    tokens_positions_per_expert = torch.empty(
        (num_experts * 2,), dtype=dtypes.i32, device=device
    )
    num_valid_ids = torch.empty((1,), dtype=dtypes.i32, device=device)
    moe_buf = torch.empty((M, model_dim), dtype=moebuf_dtype, device=device)

    # for now, moe_sorting_fwd only support int32 topk_ids
    if topk_ids.dtype != dtypes.i32:
        topk_ids = topk_ids.to(dtypes.i32)

    aiter.moe_sorting_fwd(
        topk_ids,
        topk_weights,
        sorted_ids,
        sorted_weights,
        sorted_expert_ids,
        tokens_positions_per_expert,
        num_valid_ids,
        moe_buf,
        num_experts,
        block_size,
        expert_mask,
    )
    return sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, tokens_positions_per_expert, moe_buf


def moe_kernel_prepare_input(
    A: torch.Tensor,
    B: torch.Tensor,
    A_scale: Optional[torch.Tensor],
    B_scale: Optional[torch.Tensor],
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w4a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    block_shape: Optional[List[int]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Prepare the activation input of the MoE kernel.

    Depending on the selected scheme the activations ``A`` are quantized here
    and the freshly computed scales replace ``A_scale``.  The flags are
    checked in the order int8 (w8a8 / w4a8), fp8 w8a8, weight-only
    (w8a16 / w4a16), and finally "no quantization", which requires both
    scales to be None.

    Returns:
        ``(A, A_scale)`` after the optional quantization.
    """
    if use_int8_w8a8 or use_int8_w4a8:
        # Both int8-activation schemes quantize A identically.
        assert B_scale is not None
        if block_shape is None:
            # Channel-wise int8 quantization of the activations.
            assert (per_channel_quant), "int8 quantization only supports block or channel-wise"
            A, A_scale = per_token_quant_int8(A)
        else:
            # Block-wise int8 quantization of the activations.
            assert len(block_shape) == 2
            _, block_k = block_shape[0], block_shape[1]
            A, A_scale = per_token_group_quant_int8(A, block_k)
            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
    elif use_fp8_w8a8:
        # NOTE(review): this fp8 branch calls the *int8* group quantizer (and
        # its assertion text mentions int8) although per_token_group_quant_fp8
        # exists in this module -- confirm this is intentional.
        assert B_scale is not None
        if block_shape is None:
            # Channel-wise: one group spanning the whole last dimension.
            assert (per_channel_quant), "int8 quantization only supports block or channel-wise"
            block_k = A.shape[-1]
            A, A_scale = per_token_group_quant_int8(A, block_k)
        else:
            # Block-wise quantization of the activations.
            assert len(block_shape) == 2
            _, block_k = block_shape[0], block_shape[1]
            A, A_scale = per_token_group_quant_int8(A, block_k)
            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
    elif use_int8_w8a16 or use_int4_w4a16:
        # Weight-only schemes: activations are passed through untouched.
        assert B_scale is not None
        assert block_shape is None or block_shape[0] == 0
    else:
        assert A_scale is None
        assert B_scale is None

    return A, A_scale


def get_fp8_dtype():
    """Return ``torch.float8_e4m3fn`` on capable CUDA devices, else None.

    "Capable" means CUDA is available and device 0 reports compute
    capability (9, 0) or higher.
    """
    if not torch.cuda.is_available():
        return None  # no FP8 support on CPU

    # FP8 needs a Hopper-class GPU or newer (e.g. H100).
    major, minor = torch.cuda.get_device_capability(0)
    if (major, minor) >= (9, 0):
        return torch.float8_e4m3fn  # the commonly used FP8 type
    return None  # device without FP8 support


def get_compile_backend():
    """Return the ``torch.compile`` backend name to use on this machine."""
    if torch.cuda.is_available():
        # Prefer inductor on CUDA devices.
        return "inductor"
    # CPU fallback.
    return "aot_eager"


def per_token_group_quant_fp8(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: Optional[torch.dtype] = None,
    column_major_scales: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Function to perform per-token-group quantization on an input tensor `x`.
    It converts the tensor values into signed float8 values and returns the
    quantized tensor along with the scaling factor used for quantization.
    Args:
        x: The input tensor with ndim >= 2.
        group_size: The group size used for quantization.
        eps: The minimum to avoid dividing zero.
        dtype: The dtype of output tensor. Currently ignored: the output is
            always `torch.float8_e4m3fn`.
        column_major_scales: When True the scales are allocated transposed
            and returned as a permuted view.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
        scaling factor for quantization.
    """
    dtype = torch.float8_e4m3fn  # current_platform.fp8_dtype() if dtype is None else dtype
    assert (x.shape[-1] % group_size == 0), (
        f"the last dimension of `x` {x.shape[-1]} must be divisible "
        f"by `group_size` {group_size}")
    assert x.stride(-1) == 1, "`x` groups must be contiguous"

    finfo = torch.finfo(dtype)
    fp8_min = finfo.min
    fp8_max = finfo.max

    # The kernels write clamped float32 values; the cast to fp8 happens on
    # the host after the launch.
    x_q = torch.empty_like(x, device=x.device, dtype=torch.float32)
    M = x.numel() // group_size
    N = group_size
    if column_major_scales:
        shape = (x.shape[-1] // group_size, ) + x.shape[:-1]
        x_s = torch.empty(shape, device=x.device,
                          dtype=torch.float32).permute(-1, -2)
    else:
        shape = x.shape[:-1] + (x.shape[-1] // group_size, )
        x_s = torch.empty(shape, device=x.device, dtype=torch.float32)

    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1
    if column_major_scales:
        _per_token_group_quant_fp8_colmajor[(M, )](
            x,
            x_q,
            x_s,
            group_size,
            x.shape[1],
            x.stride(0),
            x_s.stride(1),
            eps,
            fp8_min=fp8_min,
            fp8_max=fp8_max,
            BLOCK=BLOCK,
            num_warps=num_warps,
            num_stages=num_stages,
        )
    else:
        _per_token_group_quant_fp8[(M, )](
            x,
            x_q,
            x_s,
            group_size,
            x.shape[1],
            x.stride(0),
            eps,
            fp8_min=fp8_min,
            fp8_max=fp8_max,
            BLOCK=BLOCK,
            num_warps=num_warps,
            num_stages=num_stages,
        )
    # Applied for both scale layouts so the result is always fp8.
    x_q = x_q.to(torch.float8_e4m3fn)

    return x_q, x_s


@triton.jit
def _per_token_group_quant_fp8_colmajor(
    # Pointers to inputs and output
    y_ptr,
    y_q_ptr,
    y_s_ptr,
    group_size,
    # Num columns of y
    y_num_columns,
    y_row_stride,
    # Stride from one column to the next of y_s
    y_s_col_stride,
    # Avoid to divide zero
    eps,
    # Information for float8
    fp8_min,
    fp8_max,
    # Meta-parameters
    BLOCK: tl.constexpr,
):
    """A Triton-accelerated function to perform per-token-group
    quantization on a tensor.
    This function converts the tensor values into float8 values.
    """
    groups_per_row = y_num_columns // group_size

    # Map the program id to the row of X and Y it should compute.
    g_id = tl.program_id(0)
    row = g_id // groups_per_row
    row_g_id = g_id % groups_per_row

    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
    y_q_ptr += g_id * group_size

    # Convert g_id the flattened block coordinate to 2D so we can index
    # into the output y_scales matrix
    blocks_per_row = y_num_columns // group_size
    scale_col = g_id % blocks_per_row
    scale_row = g_id // blocks_per_row
    y_s_ptr += scale_col * y_s_col_stride + scale_row

    cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
    mask = cols < group_size

    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
    # Quant
    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
    y_s = _absmax / fp8_max
    y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)

    tl.store(y_q_ptr + cols, y_q, mask=mask)
    tl.store(y_s_ptr, y_s)


@triton.jit
def _per_token_group_quant_fp8(
    # Pointers to inputs and output
    y_ptr,
    y_q_ptr,
    y_s_ptr,
    group_size,
    # Num columns of y
    y_num_columns,
    y_row_stride,
    # Avoid to divide zero
    eps,
    # Information for float8
    fp8_min,
    fp8_max,
    # Meta-parameters
    BLOCK: tl.constexpr,
):
    """A Triton-accelerated function to perform per-token-group
    quantization on a tensor.
    This function converts the tensor values into float8 values.
    """
    groups_per_row = y_num_columns // group_size

    # Map the program id to the row of X and Y it should compute.
    g_id = tl.program_id(0)
    row = g_id // groups_per_row
    row_g_id = g_id % groups_per_row

    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
    y_q_ptr += g_id * group_size
    y_s_ptr += g_id

    cols = tl.arange(0, BLOCK)  # N <= BLOCK
    mask = cols < group_size

    y = tl.load(y_ptr + cols, mask=mask, other=0.0)
    y = tl.cast(y, tl.float32)
    # Quant
    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
    y_s = _absmax / fp8_max
    # No cast here: the clamped values are stored in the output buffer's own
    # element type.
    y_q = tl.clamp(y / y_s, fp8_min, fp8_max)

    tl.store(y_q_ptr + cols, y_q, mask=mask)
    tl.store(y_s_ptr, y_s)


def per_token_group_quant_int8(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: torch.dtype = torch.int8,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Function to perform per-token-group quantization on an input tensor `x`.

    It converts the tensor values into signed int8 values and returns the
    quantized tensor along with the scaling factor used for quantization.

    Args:
        x: The input tensor with ndim >= 2.
        group_size: The group size used for quantization.
        eps: The minimum to avoid dividing zero.
        dtype: The dtype of output tensor. Note that only `torch.int8`
            is supported for now.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
        scaling factor for quantization.
    """
    assert (x.shape[-1] % group_size == 0
            ), "the last dimension of `x` cannot be divisible by `group_size`"
    assert x.is_contiguous(), "`x` is not contiguous"

    iinfo = torch.iinfo(dtype)
    int8_max = iinfo.max
    int8_min = iinfo.min

    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
    x_s = torch.empty(
        x.shape[:-1] + (x.shape[-1] // group_size, ),
        device=x.device,
        dtype=torch.float32,
    )

    M = x.numel()
    # Tuned configs are looked up (which also logs a warning when none exist)
    # but are deliberately not applied yet; the fixed fallback below is
    # always used.
    get_w8a8_group_quant_configs(M, group_size)
    # NOTE(review): the kernel reshapes each BLOCK_SIZE chunk into
    # (BLOCK_SIZE // group_size, group_size), so this fallback presumably only
    # works when group_size divides 128 -- confirm for the channel-wise caller
    # that passes group_size = x.shape[-1].
    config = {
        "BLOCK_SIZE": 128,
        "num_warps": 1,
        "num_stages": 1,
    }

    grid = lambda META: (
        triton.cdiv(M, META['BLOCK_SIZE']),
    )
    _per_token_group_quant_int8[grid](
        x,
        x_q,
        x_s,
        M,
        eps,
        int8_min=int8_min,
        int8_max=int8_max,
        GROUP_SIZE=group_size,
        **config
    )

    return x_q, x_s


@triton.jit
def _per_token_group_quant_int8(
    # Pointers to inputs and output
    y_ptr,
    y_q_ptr,
    y_s_ptr,
    M,
    # Avoid to divide zero
    eps,
    int8_min,
    int8_max,
    GROUP_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr
):
    """A Triton-accelerated function to perform per-token-group
    quantization on a tensor.

    This function converts the tensor values into int8 values.  Each program
    handles BLOCK_SIZE consecutive elements of the flattened input, i.e.
    BLOCK_SIZE // GROUP_SIZE groups; M is the total element count.
    """
    g_id = tl.program_id(0)
    y_ptr += g_id * BLOCK_SIZE
    y_q_ptr += g_id * BLOCK_SIZE
    S_NUM: tl.constexpr = BLOCK_SIZE // GROUP_SIZE
    y_s_ptr += g_id * S_NUM

    cols = tl.arange(0, BLOCK_SIZE)  # N <= BLOCK_SIZE
    s_cols = tl.arange(0, S_NUM)
    mask = g_id * BLOCK_SIZE + cols < M
    # Only M // GROUP_SIZE scales exist; without this mask the last program
    # writes past the end of the scale buffer whenever M is not a multiple of
    # BLOCK_SIZE.
    s_mask = g_id * S_NUM + s_cols < M // GROUP_SIZE

    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
    y = tl.reshape(y, (S_NUM, GROUP_SIZE))
    # Quant
    _absmax = tl.maximum(tl.max(tl.abs(y), axis=1), eps)
    y_s = (_absmax / int8_max).reshape(S_NUM, 1)
    y_q = tl.clamp(y / y_s, int8_min, int8_max).to(y_q_ptr.dtype.element_ty)

    y_q = tl.reshape(y_q, (S_NUM * GROUP_SIZE))
    y_s = tl.reshape(y_s, (S_NUM))

    tl.store(y_q_ptr + cols, y_q, mask=mask)
    tl.store(y_s_ptr + s_cols, y_s.to(y_s_ptr.dtype.element_ty), mask=s_mask)


@functools.lru_cache
def get_w8a8_group_quant_configs(
    M: int, groupSize: int
) -> Optional[Dict[int, Any]]:
    """
    Return tuned configurations for the w8a8 per-token-group quant kernel.

    The return value is a dictionary that maps an irregular grid of sizes to
    kernel configurations, read from the JSON file selected by
    `get_w8a8_group_quant_config_filepath`. Returns None (after logging a
    warning) when that file does not exist.
    """
    config_file_path = get_w8a8_group_quant_config_filepath(M, groupSize)
    if os.path.exists(config_file_path):
        with open(config_file_path) as f:
            # logger.info(
            #     "Using configuration from %s for W8A8 GROUP QUANT kernel.",
            #     config_file_path,
            # )
            # If a configuration has been found, return it
            return {int(key): val for key, val in json.load(f).items()}

    # If no optimized configuration is available, we will use the default
    # configuration
    logger.warning(
        (
            "Using default W8A8 GROUP QUANT kernel config. Performance might "
            "be sub-optimal! Config file not found at %s"
        ),
        config_file_path,
    )
    return None


@functools.lru_cache
def get_w8a8_group_quant_config_filepath(M: int, GROUP_SIZE: int) -> str:
    """Return the path of the tuned group-quant JSON config for this device.

    The file name embeds the device name and ``GROUP_SIZE``; ``M`` does not
    influence the path.  Device names starting with "bw" (case-insensitive)
    all share the "BW200" config.
    """
    device_name = get_device_name()

    if device_name.lower().startswith("bw"):
        device_name = "BW200"
    json_file_name = f"w8a8_per_token_group_quant_device_name={device_name},group_size={GROUP_SIZE}.json"
    config_file_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "../configs/quant_configs", json_file_name
    )
    return config_file_path


@triton.jit
def write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, offs_token,
                          token_mask, BLOCK_SIZE_M, BLOCK_SIZE_N,
                          compute_type):
    """Store a zero tile of ``compute_type`` into the output ``c_ptr``.

    The tile covers the rows given by ``offs_token`` (filtered by
    ``token_mask``) and the ``pid_n``-th block of ``BLOCK_SIZE_N`` columns,
    limited to columns below ``N``.
    """
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[
        None, :]).to(tl.int64)
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
+ - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + + tl.assume(stride_am >= 0) + tl.assume(stride_ak >= 0) + tl.assume(stride_be >= 0) + tl.assume(stride_bk >= 0) + tl.assume(stride_bn >= 0) + tl.assume(stride_cm >= 0) + tl.assume(stride_cn >= 0) + tl.assume(stride_bse >= 0) + tl.assume(stride_bsk >= 0) + tl.assume(stride_bsn >= 0) + tl.assume(stride_bze >= 0) + tl.assume(stride_bzk >= 0) + tl.assume(stride_bzn >= 0) + + # to notify the compiler that sorted_token_ids_ptr is a pointer to the memory, + # and all value in the memory is non-negative. + tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0) + + tl.static_assert(COMBINE_SCALE_LOAD == False, "COMBINE_SCALE_LOAD not support for awq!") + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. 
+ pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + + offs_token_id = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)).to(tl.int32) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m) + if off_experts == -1: + # ----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. 
+ write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, + offs_token, token_mask, BLOCK_SIZE_M, + BLOCK_SIZE_N, compute_type) + return + + tl.assume(off_experts >= 0) + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N)).to(tl.int32) % N + offs_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int32) + + + if USE_ADDR_OFFSET_INT64_A: + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak).to(tl.int64) + else: + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak).to(tl.int32) + + if use_int4_w4a16: + b_ptrs = b_ptr + (off_experts * stride_be + \ + (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * \ + stride_bn).to(tl.int32) + b_shifter = (offs_k[:, None] % 2) * 4 + elif use_int8_w8a16: + b_ptrs = b_ptr + (off_experts * stride_be + \ + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn).to(tl.int32) + + if not has_zp and use_int4_w4a16: + b_zp_num = 8 + if not has_zp and use_int8_w8a16: + b_zp_num = 128 + elif has_zp and use_int4_w4a16: + b_zp_shifter = (offs_bn[None, :] % 2) * 4 + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. 
+ + if not block_k_diviable: + k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K + k_other = 0.0 + else: + k_mask = None + k_other = None + + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs) + if use_int4_w4a16: + b = (b >> b_shifter) & 0xF + + b_scale_ptrs = b_scale_ptr + (off_experts * stride_bse + \ + offs_bn[None, :] * stride_bsn + \ + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * \ + stride_bsk).to(tl.int32) + b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other) + b_scale = b_scale.to(tl.float32) + + if has_zp and use_int4_w4a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + (off_experts * stride_bze + \ + (offs_bn[None, :] // 2) * stride_bzn + \ + offs_k_true * stride_bzk).to(tl.int32) + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = ((b_zp >> b_zp_shifter) & 0xF) + b_zp = b_zp.to(tl.float32) + elif has_zp and use_int8_w8a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + (off_experts * stride_bze + \ + offs_bn[None, :] * stride_bzn + \ + offs_k_true * stride_bzk).to(tl.int32) + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = b_zp.to(tl.float32) + + # We accumulate along the K dimension. + if has_zp: + b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type) + else: + b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type) + + accumulator = tl.dot(a, b, acc=accumulator) + + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak + if use_int4_w4a16: + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + else: + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + if USE_ADDR_OFFSET_INT64_C: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int64) + else: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int32) + + if block_n_diviable: + c_mask = token_mask[:, None] + else: + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def fused_moe_kernel( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + a_scale_ptr, + b_scale_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N, + K, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
+ stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + block_k_diviable: tl.constexpr, + block_n_diviable: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + COMBINE_SCALE_LOAD: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + USE_ADDR_OFFSET_INT64_A: tl.constexpr, + USE_ADDR_OFFSET_INT64_C: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. 
The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + + tl.assume(stride_am >= 0) + tl.assume(stride_ak >= 0) + tl.assume(stride_be >= 0) + tl.assume(stride_bk >= 0) + tl.assume(stride_bn >= 0) + tl.assume(stride_cm >= 0) + tl.assume(stride_cn >= 0) + tl.assume(stride_bse >= 0) + tl.assume(stride_bsk >= 0) + tl.assume(stride_bsn >= 0) + + # to notify the compiler that sorted_token_ids_ptr is a pointer to the memory, + # and all value in the memory is non-negative. + tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0) + + if group_k > 0: + tl.static_assert(BLOCK_SIZE_K <= group_k and group_k % BLOCK_SIZE_K == 0, + "BLOCK_SIZE_K must be divisible by GROUP_SIZE_K") + if COMBINE_SCALE_LOAD: # used for use_int8_w8a8 + tl.static_assert(stride_ask == 1, + "COMBINE_SCALE_LOAD implictly stride_ask == 1!") + tl.static_assert(MUL_ROUTED_WEIGHT == False, + "COMBINE_SCALE_LOAD and MUL_ROUTED_WEIGHT cannot be both true due to w1_scale and w2_scale diff layout!") + tl.static_assert(block_k_diviable == True and BLOCK_SIZE_K == group_k, + "COMBINE_SCALE_LOAD only add and verify on block_k_diviable!") + tl.static_assert(use_int8_w8a8 == True, + "COMBINE_SCALE_LOAD only add and verify on use_int8_w8a8!") + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. 
+ pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int32) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int32) + if off_experts == -1: + # ----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. 
+ write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, + offs_token, token_mask, BLOCK_SIZE_M, + BLOCK_SIZE_N, compute_type) + return + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)) % N + + offs_k = tl.arange(0, BLOCK_SIZE_K) + if USE_ADDR_OFFSET_INT64_A: + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak).to(tl.int64) + else: + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak).to(tl.int32) + b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn).to(tl.int32) + + if use_int8_w8a16: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + + if use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + if COMBINE_SCALE_LOAD: + a_scale_ptrs = a_scale_ptr + (offs_token[:, None] // top_k) * stride_asm + offs_bsn = offs_bn // group_n + b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse + + offs_bsn[:, None] * stride_bsn) + else: + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + offs_bsn = offs_bn // group_n + b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse + + offs_bsn * stride_bsn) + + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr + off_experts) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + if COMBINE_SCALE_LOAD: + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2): + # Load the next block of A and B, generate a mask by checking the + # K dimension. 
+ if not block_k_diviable: + a0 = tl.load(a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b0 = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + else: + a0 = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0) + b0 = tl.load(b_ptrs) + + # We accumulate along the K dimension. + if use_int8_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + tl.static_assert(False, "Not implemented") + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + tl.arange(0, 2) + a_scale = tl.load(a_scale_ptrs + offs_ks[None, :] * stride_ask, + mask=token_mask[:, None], + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks[None, :] * stride_bsk) + a_scale_0, a_scale_1 = tl.split(a_scale) + b_scale_0, b_scale_1 = tl.split(b_scale) + + accumulator += tl.dot(a0, b0) * a_scale_0[:, + None] * b_scale_0[None, :] + + if not block_k_diviable: + a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, + mask=token_mask[:, None] & (offs_k[None, :] < K - (k + 1) * BLOCK_SIZE_K), + other=0.0) + b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk, + mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0) + else: + a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, mask=token_mask[:, None], other=0.0) + b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk) + accumulator += tl.dot(a1, b1) * a_scale_1[:, + None] * b_scale_1[None, :] + else: + accumulator = tl.dot(a, b, acc=accumulator) + tl.static_assert(False, "Not implemented") + else: + accumulator += tl.dot(a, b) + tl.static_assert(False, "Not implemented") + + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak * 2 + b_ptrs += BLOCK_SIZE_K * stride_bk * 2 + + else: # non-COMBINE_SCALE_LOAD + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + + # Load the next block of A and B, generate a mask by checking the + # K dimension. 
+ if not block_k_diviable: + a = tl.load(a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + else: + a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0) + b = tl.load(b_ptrs) + + # We accumulate along the K dimension. + if use_int8_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, + mask=token_mask, + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + accumulator += tl.dot(a, b) * a_scale[:, + None] * b_scale[None, :] + else: + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + if use_int8_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + if USE_ADDR_OFFSET_INT64_C: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int64) + else: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int32) + if not block_n_diviable: + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + else: + c_mask = token_mask[:, None] + + 
tl.store(c_ptrs, accumulator, mask=c_mask) + + + +def ceil_div(a, b): + return (a + b - 1) // b + + +@triton.jit +def moe_align_block_size_stage1( + topk_ids_ptr, + tokens_cnts_ptr, + num_experts: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + + start_idx = pid * tokens_per_thread + + off_c = (pid + 1) * num_experts + + for i in range(tokens_per_thread): + if start_idx + i < numel: + idx = tl.load(topk_ids_ptr + start_idx + i) + token_cnt = tl.load(tokens_cnts_ptr + off_c + idx) + tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1) + + +@triton.jit +def moe_align_block_size_stage2( + tokens_cnts_ptr, + num_experts: tl.constexpr, +): + pid = tl.program_id(0) + + last_cnt = 0 + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid) + last_cnt = last_cnt + token_cnt + tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt) + + +@triton.jit +def moe_align_block_size_stage3( + total_tokens_post_pad_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, +): + last_cumsum = 0 + off_cnt = num_experts * num_experts + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1) + last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size + tl.store(cumsum_ptr + i, last_cumsum) + tl.store(total_tokens_post_pad_ptr, last_cumsum) + + +@triton.jit +def moe_align_block_size_stage4( + topk_ids_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + start_idx = tl.load(cumsum_ptr + pid) + end_idx = tl.load(cumsum_ptr + pid + 1) + + for i in range(start_idx, end_idx, block_size): + tl.store(expert_ids_ptr + i // block_size, pid) + + start_idx = pid * tokens_per_thread + off_t = pid * num_experts + + for i in range(start_idx, 
tl.minimum(start_idx + tokens_per_thread, + numel)): + expert_id = tl.load(topk_ids_ptr + i) + token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id) + rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id) + tl.store(sorted_token_ids_ptr + rank_post_pad, i) + tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1) + + +# Triton implementation based on: +# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0 +def moe_align_block_size_triton( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + numel = topk_ids.numel() + grid = (num_experts, ) + tokens_cnts = torch.zeros((num_experts + 1, num_experts), + dtype=torch.int32, + device=topk_ids.device) + cumsum = torch.zeros((num_experts + 1, ), + dtype=torch.int32, + device=topk_ids.device) + tokens_per_thread = ceil_div(numel, num_experts) + + moe_align_block_size_stage1[grid]( + topk_ids, + tokens_cnts, + num_experts, + numel, + tokens_per_thread, + ) + moe_align_block_size_stage2[grid]( + tokens_cnts, + num_experts, + ) + moe_align_block_size_stage3[(1, )]( + num_tokens_post_pad, + tokens_cnts, + cumsum, + num_experts, + block_size, + ) + moe_align_block_size_stage4[grid]( + topk_ids, + sorted_token_ids, + expert_ids, + tokens_cnts, + cumsum, + num_experts, + block_size, + numel, + tokens_per_thread, + ) + + +def moe_align_block_size( + topk_ids: torch.Tensor, + block_size: int, + num_experts: int, + expert_map: torch.Tensor = None +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Aligns the token distribution across experts to be compatible with block + size for matrix multiplication. + + Parameters: + - topk_ids: A tensor of shape [total_tokens, top_k] representing the + top-k expert indices for each token. + - block_size: The block size used in block matrix multiplication. + - num_experts: The total number of experts. 
+ - expert_map: A tensor of shape [num_experts] that maps the expert index + from the global space to the local index space of the current + expert parallel shard. If the expert is not in the current expert + parallel shard, the mapping is set to -1. + + Returns: + - sorted_token_ids: A tensor containing the sorted token indices according + to their allocated expert. + - expert_ids: A tensor indicating the assigned expert index for each block. + - num_tokens_post_padded: The total number of tokens after padding, + ensuring divisibility by block_size. + + This function pads the number of tokens that each expert needs to process + so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions + align correctly. + + Example: + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], + block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, + with each expert needing to process 3 tokens. + - As block_size is 4, we pad 1 token for each expert. + - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. + - Then append padding tokens [12, 12, 12, 12] for each block. + - After sorting by expert index, we obtain token_ids + [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in + the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible + by block_size for proper block matrix operations. + """ + max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + + sorted_ids = torch.empty((max_num_tokens_padded, ), + dtype=torch.int32, + device=topk_ids.device) + sorted_ids.fill_(topk_ids.numel()) + max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size) + # Expert ids must be zeroed out to prevent index out of bounds error while + # mapping global expert ids to local expert ids in expert parallelism. 
+ expert_ids = torch.zeros((max_num_m_blocks, ), + dtype=torch.int32, + device=topk_ids.device) + num_tokens_post_pad = torch.empty((1), + dtype=torch.int32, + device=topk_ids.device) + if num_experts >= 224: + if num_experts != 256: + moe_align_block_size_triton( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) + else: + # Currently requires num_experts=256 + aiter.moe_c_sgl_moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) + else: + aiter.moe_c_moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, + expert_ids, num_tokens_post_pad) + if expert_map is not None: + expert_ids = expert_map[expert_ids] + + return sorted_ids, expert_ids, num_tokens_post_pad + + + +# def generate_sum_configs(): +# configs = [] +# for block_m in [16, 32, 64, 128]: +# for block_n in [32, 64, 128, 256]: +# for num_warps in [2, 4, 8, 16]: +# for num_stages in [1, 2]: +# config = triton.Config({ +# 'BLOCK_SIZE_M': block_m, +# 'BLOCK_SIZE_N': block_n, +# }, num_warps=num_warps, num_stages=num_stages) +# configs.append(config) +# return configs + +# @triton.autotune( +# key=['M', 'N', 'top_k','compute_type'], +# # configs=generate_sum_configs(), +# configs = [ +# triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32 }, num_warps=4), +# triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64 }, num_warps=8), +# triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128 }, num_warps=8), +# triton.Config({'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128 }, num_warps=16), +# ], +# # perf_debug=True, +# ) + + +device_name = get_device_name() +if device_name=='K100_AI': + moe_sum_best_configs = { + # M, topK, N + (1, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, "num_warps" : 4 }, + (4, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, "num_warps" : 16}, + (16, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, "num_warps" : 16}, + (32, 8, 7168): {'BLOCK_SIZE_M': 16, 
'BLOCK_SIZE_N': 256, "num_warps" : 16}, + (64, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, "num_warps" : 16}, + (128, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, "num_warps" : 16}, + } +elif device_name=='BW200' or device_name.upper().startswith('BW'): + moe_sum_best_configs = { + # M, topK, N + (1, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, "num_warps" : 4} , + (4, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, "num_warps" : 16}, + (16, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, "num_warps" : 16}, + (32, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, "num_warps" : 16}, + (64, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, "num_warps" : 16}, + (128, 8, 7168): {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, "num_warps" : 16}, + } +else: + moe_sum_best_configs = {} + +def get_moe_sum_config(M, top_k, N): + + if moe_sum_best_configs: + # config = moe_sum_best_configs[min(moe_sum_best_configs.keys(), key=lambda x: abs(x[0] - M))] #torch.compile不支持 + best_key = None + min_diff = float('inf') + for key in moe_sum_best_configs.keys(): + diff = abs(key[0] - M) + if diff < min_diff: + min_diff = diff + best_key = key + config = moe_sum_best_configs[best_key] + + else: + if M < 32: + config = {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, "num_warps" : 4} + else: + config = {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, "num_warps" : 8} + + return config + +@triton.heuristics({ + 'block_m_dividable': lambda nargs: nargs['M'] % nargs['BLOCK_SIZE_M'] == 0, + 'block_n_dividable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0, +}) + + +@triton.jit +def moe_sum_kernel( + # Pointers to matrices + output_ptr, # [M, N] + input_ptr, # [M, top_k, N] + # Matrix dimensions + M: tl.constexpr, + N: tl.constexpr, + top_k: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + stride_output_m, + stride_output_n, + stride_input_m, + stride_input_k, + stride_input_n, + compute_type: tl.constexpr, + 
block_m_dividable: tl.constexpr, + block_n_dividable: tl.constexpr, +): + """ + Args: + output_ptr: shape [M, N] + input_ptr: shape[M, top_k, N] + """ + tl.assume(stride_output_m >= 0) + tl.assume(stride_output_n >= 0) + tl.assume(stride_input_m >= 0) + tl.assume(stride_input_k >= 0) + tl.assume(stride_input_n >= 0) + + pid = tl.program_id(axis=0) + + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + mask_m = offs_m < M + mask_n = offs_n < N + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(top_k): + input_ptrs = input_ptr + (offs_m[:, None] * stride_input_m + + k * stride_input_k + offs_n[None, :] * stride_input_n).to(tl.int32) + if block_m_dividable: + x = tl.load(input_ptrs) + else: + x = tl.load(input_ptrs, + mask=mask_m[:, None] & mask_n[None, :], + other=0.0) + + acc += x + + acc = acc.to(compute_type) + + output_ptrs = output_ptr + (offs_m[:, None] * stride_output_m + + offs_n[None, :] * stride_output_n).to(tl.int32) + + if block_m_dividable and block_n_dividable: + tl.store(output_ptrs, acc) + else: + tl.store(output_ptrs, acc, mask=mask_m[:, None] & mask_n[None, :]) + + +def triton_moe_sum_noaiter(input_tensor, + output_tensor): + """ + Args: + input_tensor: [M, top_k, N] + output_tensor: [M, N] + """ + M, top_k, N = input_tensor.shape + + # 计算grid + # grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv( + # N, META['BLOCK_SIZE_N']), ) + config = get_moe_sum_config(M, top_k, N) + grid = (triton.cdiv(M, config['BLOCK_SIZE_M']) * triton.cdiv(N, config['BLOCK_SIZE_N']),) + + # Check constraints. 
def invoke_fused_moe_kernel(A: torch.Tensor,
                            B: torch.Tensor,
                            B_new: torch.Tensor,
                            C: torch.Tensor,
                            A_scale: Optional[torch.Tensor],
                            B_scale: Optional[torch.Tensor],
                            B_zp: Optional[torch.Tensor],
                            topk_weights: torch.Tensor,
                            topk_ids: torch.Tensor,
                            sorted_token_ids: torch.Tensor,
                            expert_ids: torch.Tensor,
                            num_tokens_post_padded: torch.Tensor,
                            mul_routed_weight: bool,
                            top_k: int,
                            config: Dict[str, Any],
                            compute_type: tl.dtype,
                            use_fp8_w8a8: bool,
                            use_int8_w8a8: bool,
                            use_int8_w8a16: bool,
                            use_int4_w4a16: bool,
                            use_int4_w4a16_base: bool,
                            BM: int,
                            BN: int,
                            BK: int,
                            kloops: int,
                            nloops: int,
                            is_bottom: bool,
                            block_shape: Optional[List[int]] = None) -> None:
    """Quantize the activations if requested and run one MoE expert GEMM into ``C``.

    Dispatch, as visible in this function:

    * A quantization flag is set and ``block_shape[1] > 0``: the block config
      is completed via ``get_moe_wna16_block_config``.  If
      ``should_moe_wna16_use_cuda`` returns true, one ``aiter.moe_c_*`` kernel
      is called and the function returns; otherwise the Triton
      ``fused_moe_kernel_gptq_awq`` is launched.
    * Anything else: the Triton ``fused_moe_kernel`` is launched.

    Args:
        A: Activations.  For the w8a8 modes the local name is rebound to the
            output of the ``per_token_*quant*`` helpers.
        B: Expert weights.  NOTE(review): presumably (experts, N, K) -- confirm.
        B_new: Alternative weight tensor; only passed to
            ``aiter.moe_c_moe_wna16_gemm`` (non-bottom int4 path).
        C: Output tensor handed to the kernels.
        A_scale: Activation scales; must be None in the unquantized case and
            is overwritten in the w8a8 modes.
        B_scale: Weight scales; required for every quantized mode.
        B_zp: Optional weight zero points.
        topk_weights: Routing weights; forwarded to the ``aiter`` kernels only
            when ``mul_routed_weight`` is true.
        topk_ids: Selected expert ids; only ``numel()`` / ``shape[1]`` are
            read here.
        sorted_token_ids, expert_ids, num_tokens_post_padded: Forwarded
            unchanged to the kernels.
        mul_routed_weight: Whether the kernel multiplies by ``topk_weights``.
        top_k: Number of experts per token.
        config: Kernel launch configuration; copied before being modified.
        compute_type: Triton compute dtype for the Triton kernels.
        use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
        use_int4_w4a16_base: Quantization mode selectors.
        BM, BN, BK, kloops, nloops: Tuning values.  When the environment
            variable ``WHICH_TO_TEST`` is set they overwrite the config
            entries; otherwise ``kloops``/``nloops`` only fill missing entries.
        is_bottom: Selects the ``kernel2`` / ``_2`` kernel variants.
        block_shape: ``[block_n, block_k]``; must not be None for
            ``use_fp8_w8a8`` because it is indexed unconditionally there.
    """
    # Any non-empty value of WHICH_TO_TEST switches to the explicit tuning values.
    find_best = os.getenv("WHICH_TO_TEST")

    assert topk_weights.stride(1) == 1
    assert sorted_token_ids.stride(0) == 1

    if use_fp8_w8a8:
        assert B_scale is not None

        # NOTE(review): unlike the int8 branch there is no block_shape-is-None
        # path here; a None block_shape fails on len().
        assert len(block_shape) == 2
        block_n, block_k = block_shape[0], block_shape[1]
        A, A_scale = per_token_group_quant_fp8(A, block_k)
        assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
        assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
        assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
    elif use_int8_w8a8:
        assert B_scale is not None
        if block_shape is None:
            A, A_scale = per_token_quant_int8(A)
        else:
            assert len(block_shape) == 2
            block_n, block_k = block_shape[0], block_shape[1]
            A, A_scale = per_token_group_quant_int8(A, block_k)

            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
    elif use_int8_w8a16 or use_int4_w4a16:
        assert B_scale is not None
        # assert block_shape is None or block_shape[0] == 0
    else:
        assert A_scale is None
        assert B_scale is None

    EM = sorted_token_ids.shape[0]

    if (find_best):
        if A.shape[0] < BM:
            # optimize for small batch_size.
            # We assume that top_ids of each token is unique, so
            # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
            # and we can skip some invalid blocks.
            EM = min(sorted_token_ids.shape[0],
                     A.shape[0] * top_k * BM)
    else:
        if A.shape[0] < config["BLOCK_SIZE_M"]:
            # optimize for small batch_size.
            # We assume that top_ids of each token is unique, so
            # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
            # and we can skip some invalid blocks.
            EM = min(sorted_token_ids.shape[0],
                     A.shape[0] * top_k * config["BLOCK_SIZE_M"])

    # One program per (M-block, N-block) pair of the Triton launch.
    grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv(
        B.shape[1], META['BLOCK_SIZE_N']), )

    if (use_int8_w8a16 or use_int4_w4a16 or use_int8_w8a8 or use_fp8_w8a8) and \
            block_shape is not None and block_shape[1] > 0:
        assert B_scale is not None and B_scale.ndim == 3
        assert B_zp is None or B_zp.ndim == 3

        use_moe_wna16_cuda = should_moe_wna16_use_cuda(
            num_valid_tokens=topk_ids.numel(),
            group_size=block_shape[1],
            num_experts=B.shape[0],
            bit=4 if use_int4_w4a16 else 8)
        # Work on a copy so the caller's config dict is left untouched.
        config = config.copy()
        # NOTE(review): num_experts is taken from B.shape[1] here but from
        # B.shape[0] in the call above -- confirm which one is intended.
        config.update(
            get_moe_wna16_block_config(config=config,
                                       use_moe_wna16_cuda=use_moe_wna16_cuda,
                                       num_valid_tokens=topk_ids.numel(),
                                       size_k=A.shape[1],
                                       size_n=B.shape[1],
                                       num_experts=B.shape[1],
                                       group_size=block_shape[1],
                                       real_top_k=topk_ids.shape[1],
                                       block_size_m=config["BLOCK_SIZE_M"]))

        if (find_best):
            # Tuning mode: the explicit arguments win over the config.
            config["BLOCK_SIZE_M"] = BM
            config["BLOCK_SIZE_N"] = BN
            config["BLOCK_SIZE_K"] = BK
            config["kloops"] = kloops
            config["nloops"] = nloops
        else:
            config.setdefault("kloops", kloops)
            config.setdefault("nloops", nloops)
        if use_moe_wna16_cuda:
            if block_shape[0] > 1 and block_shape[1] > 1:
                # Block-wise quantized weights.
                bit = 4 if use_int4_w4a16 else 8
                if use_int8_w8a16:
                    aiter.moe_c_moe_w8a16_gemm_block_wise(A, C, B, B_scale, B_zp,
                                        topk_weights if mul_routed_weight else None,
                                        sorted_token_ids, expert_ids,
                                        num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                        config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                        config["BLOCK_SIZE_K"], bit)
                elif use_int8_w8a8:
                    if is_bottom:
                        aiter.moe_c_moe_w8a8_gemm_block_wise_kernel2(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                    else:
                        aiter.moe_c_moe_w8a8_gemm_block_wise(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                elif use_fp8_w8a8:
                    if is_bottom:
                        # The fp8 kernels receive the tensors viewed as int8.
                        A = A.view(torch.int8)
                        B = B.view(torch.int8)
                        aiter.moe_c_moe_w8a8_gemm_block_wise_kernel2_fp8(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                    else:
                        A = A.view(torch.int8)
                        B = B.view(torch.int8)
                        aiter.moe_c_moe_w8a8_gemm_block_wise_fp8(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                # NOTE(review): if none of the three flags above is set this
                # returns without launching any kernel.
                return

            else:
                # Group-wise (AWQ style) quantized weights.
                bit = 4 if use_int4_w4a16 else 8
                if (bit == 8 and use_int8_w8a16):
                    aiter.moe_c_moe_w8a16_gemm_awq(A, C, B, B_scale, B_zp,
                                        topk_weights if mul_routed_weight else None,
                                        sorted_token_ids, expert_ids,
                                        num_tokens_post_padded, top_k,
                                        config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                        config["BLOCK_SIZE_K"], bit)
                    return

                elif use_int4_w4a16_base:
                    aiter.moe_c_moe_wna16_gemm_base(A, C, B, B_scale, B_zp,
                                        topk_weights if mul_routed_weight else None,
                                        sorted_token_ids, expert_ids,
                                        num_tokens_post_padded, top_k,
                                        config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                        config["BLOCK_SIZE_K"], bit)
                    return
                else:
                    if is_bottom:
                        aiter.moe_c_moe_wna16_gemm_2(A, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                    else:
                        # This is the only call that uses B_new instead of B.
                        aiter.moe_c_moe_wna16_gemm(A, C, B_new, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                    return

        # Triton fallback for quantized weights.  Tensors of 2**31 - 1 bytes
        # or more make the kernel use int64 address offsets for A and C.
        offset_max = 2**31 - 1
        use_addr_offset_int64_a = A.numel() * A.element_size() >= offset_max
        use_addr_offset_int64_c = C.numel() * C.element_size() >= offset_max
        if (A.numel() * A.element_size() >= offset_max or
            B.numel() * B.element_size() >= offset_max or
            C.numel() * C.element_size() >= offset_max):
            logger.warning(
                ("A,B,C numel:%ld, %ld, %ld has out of range for fused_moe_kernel kernel!"
                 "Use int64 for address offset."), A.numel(), B.numel(), C.numel())

        fused_moe_kernel_gptq_awq[grid](
            A,
            B,
            C,
            B_scale,
            B_zp,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            B.shape[1],
            A.shape[1],
            EM,
            topk_ids.numel(),
            A.stride(0),
            A.stride(1),
            B.stride(0),
            B.stride(2),
            B.stride(1),
            C.stride(1),
            C.stride(2),
            B_scale.stride(0),
            B_scale.stride(2),
            B_scale.stride(1),
            B_zp.stride(0) if B_zp is not None else 0,
            B_zp.stride(2) if B_zp is not None else 0,
            B_zp.stride(1) if B_zp is not None else 0,
            block_k_diviable=A.shape[1] % config["BLOCK_SIZE_K"] == 0,
            block_n_diviable=B.shape[1] % config["BLOCK_SIZE_N"] == 0,
            group_size=block_shape[1],
            MUL_ROUTED_WEIGHT=mul_routed_weight,
            USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a,
            USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c,
            top_k=top_k,
            compute_type=compute_type,
            has_zp=B_zp is not None,
            use_int4_w4a16=use_int4_w4a16,
            use_int8_w8a16=use_int8_w8a16,
            **config,
        )
    else:
        # simple check out of range, not accurate.
        offset_max = 2**31 - 1
        use_addr_offset_int64_a = A.numel() * A.element_size() >= offset_max
        use_addr_offset_int64_c = C.numel() * C.element_size() >= offset_max
        if (A.numel() * A.element_size() >= offset_max or
            B.numel() * B.element_size() >= offset_max or
            C.numel() * C.element_size() >= offset_max):
            logger.warning(
                ("A,B,C numel:%ld, %ld, %ld has out of range for fused_moe_kernel kernel!"
                 "Use int64 for address offset."), A.numel(), B.numel(), C.numel())

        # BLOCK_SIZE_K is passed explicitly below, so take it out of the copy
        # that is splatted into the launch.
        config = config.copy()
        BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
        if block_shape is not None:
            BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0],
                                                 block_shape[1]))
        fused_moe_kernel[grid](
            A,
            B,
            C,
            A_scale,
            B_scale,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            B.shape[1],
            B.shape[2],
            EM,
            topk_ids.numel(),
            A.stride(0),
            A.stride(1),
            B.stride(0),
            B.stride(2),
            B.stride(1),
            C.stride(1),
            C.stride(2),
            A_scale.stride(0)
            if A_scale is not None and A_scale.ndim == 2 else 0,
            A_scale.stride(1)
            if A_scale is not None and A_scale.ndim == 2 else 0,
            B_scale.stride(0)
            if B_scale is not None and B_scale.ndim >= 2 else 0,
            B_scale.stride(2)
            if B_scale is not None and B_scale.ndim == 3 else 0,
            B_scale.stride(1)
            if B_scale is not None and B_scale.ndim >= 2 else 0,
            0 if block_shape is None else block_shape[0],
            0 if block_shape is None else block_shape[1],
            block_k_diviable=A.shape[1] % BLOCK_SIZE_K == 0,
            block_n_diviable=B.shape[1] % config["BLOCK_SIZE_N"] == 0,
            MUL_ROUTED_WEIGHT=mul_routed_weight,
            USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a,
            USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c,
            top_k=top_k,
            compute_type=compute_type,
            use_fp8_w8a8=use_fp8_w8a8,
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            BLOCK_SIZE_K=BLOCK_SIZE_K,
            COMBINE_SCALE_LOAD=config.pop("COMBINE_SCALE_LOAD", None),
            **config,
        )
def invoke_fused_moe_kernel_marlin(A: torch.Tensor,
                            B: torch.Tensor,
                            C: torch.Tensor,
                            A_scale: Optional[torch.Tensor],
                            B_scale: Optional[torch.Tensor],
                            B_zp: Optional[torch.Tensor],
                            topk_weights: torch.Tensor,
                            topk_ids: torch.Tensor,
                            sorted_token_ids: torch.Tensor,
                            expert_ids: torch.Tensor,
                            num_tokens_post_padded: torch.Tensor,
                            mul_routed_weight: bool,
                            top_k: int,
                            MODE: int,
                            config: Dict[str, Any],
                            compute_type: tl.dtype,
                            use_fp8_w8a8: bool,
                            use_int8_w8a8: bool,
                            use_int8_w4a8: bool,
                            use_int8_w8a16: bool,
                            use_int4_w4a16: bool,
                            use_int4_w4a16_base: bool,
                            is_bottom: bool,
                            block_shape: Optional[List[int]] = None) -> None:
    """Quantize the activations if requested and call one ``aiter.moe_c_*`` GEMM.

    Only quantized modes launch a kernel: when none of the ``use_*`` flags
    (other than ``use_int4_w4a16_base``) is set the function validates its
    inputs and returns without computing anything.

    * ``block_shape`` with both entries > 1: the block-wise kernels are used.
    * otherwise: the AWQ / base kernels, or the ``moe_c_moe_gemm_marlin_*``
      kernels for w4a16, and for w8a8 / w4a8 / fp8 when ``block_shape`` is None.

    Args:
        A: Activations; rebound to the quantized tensor for the w8a8 / w4a8 /
            fp8 modes.
        B: Expert weights; viewed as ``uint32`` for the marlin w4a16 kernel
            and as ``int8`` for the fp8 block-wise kernels.
        C: Output tensor handed to the kernels.
        A_scale, B_scale, B_zp: Scales and zero points; ``B_scale`` is
            required (3-D) for every quantized mode.
        topk_weights: Routing weights.  The marlin kernels receive them only
            when ``is_bottom`` is true, the other kernels only when
            ``mul_routed_weight`` is true.
        topk_ids, compute_type: Not read by this function.
        sorted_token_ids, expert_ids, num_tokens_post_padded: Forwarded
            unchanged to the kernels.
        top_k: Number of experts per token.
        MODE: Written to ``config["MODE"]`` when the environment variable
            ``WHICH_TO_TEST`` is set; otherwise ``config`` must already hold
            a ``"MODE"`` entry for the marlin kernels.
        config: Kernel configuration (``BLOCK_SIZE_*``, ``kloops``,
            ``nloops``, ``MODE``).
        is_bottom: Selects the second-GEMM variants of the kernels.
        block_shape: Optional ``[block_n, block_k]`` quantization block.
    """
    # Any non-empty value of WHICH_TO_TEST switches to the explicit MODE.
    find_best = os.getenv("WHICH_TO_TEST")
    assert topk_weights.stride(1) == 1
    assert sorted_token_ids.stride(0) == 1

    if use_fp8_w8a8:
        assert B_scale is not None
        if block_shape is None:

            A, A_scale = per_token_quant_hip(A, quant_dtype=torch.float8_e4m3fn)

        else:
            assert len(block_shape) == 2
            block_n, block_k = block_shape[0], block_shape[1]
            A, A_scale = per_token_group_quant_fp8(A, block_k)
            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
    elif use_int8_w8a8:
        assert B_scale is not None
        if block_shape is None:
            A, A_scale = moe_kernel_prepare_input(
                A=A,
                B=B,
                A_scale=None,
                B_scale=B_scale,
                use_fp8_w8a8=False,
                use_int8_w8a8=True,
                use_int8_w8a16=False,
                use_int4_w4a16=False,
                per_channel_quant=True,
                block_shape=None
            )

        else:
            assert len(block_shape) == 2
            block_n, block_k = block_shape[0], block_shape[1]
            A, A_scale = per_token_group_quant_int8(A, block_k)
            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
    elif use_int8_w4a8:
        assert B_scale is not None
        if block_shape is None:
            A, A_scale = moe_kernel_prepare_input(
                A=A,
                B=B,
                A_scale=None,
                B_scale=B_scale,
                use_fp8_w8a8=False,
                use_int8_w8a8=False,
                use_int8_w4a8=True,
                use_int8_w8a16=False,
                use_int4_w4a16=False,
                per_channel_quant=True,
                block_shape=None
            )

        else:
            assert len(block_shape) == 2
            block_n, block_k = block_shape[0], block_shape[1]
            A, A_scale = per_token_group_quant_int8(A, block_k)
            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
    elif use_int8_w8a16 or use_int4_w4a16:
        assert B_scale is not None
        # assert block_shape is None or block_shape[0] == 0
    else:
        assert A_scale is None
        assert B_scale is None

    if (use_int8_w8a16 or use_int4_w4a16 or use_fp8_w8a8 or use_int8_w8a8 or use_int8_w4a8):
        assert B_scale is not None and B_scale.ndim == 3
        assert B_zp is None or B_zp.ndim == 3

        # The should_moe_wna16_use_cuda() heuristic is bypassed here: the
        # aiter kernels are always used.
        use_moe_wna16_cuda = True

        if (find_best):
            # NOTE(review): this writes into the caller's dict; if that dict
            # comes from a cached config table the entry is changed for every
            # later user as well -- confirm this is intended.
            config["MODE"] = MODE
        else:
            pass

        if use_moe_wna16_cuda:
            if block_shape and block_shape[0] > 1 and block_shape[1] > 1:
                # Block-wise quantized weights.
                bit = 4 if use_int4_w4a16 else 8

                if use_int8_w8a16:
                    aiter.moe_c_moe_w8a16_gemm_block_wise(A, C, B, B_scale, B_zp,
                                        topk_weights if mul_routed_weight else None,
                                        sorted_token_ids, expert_ids,
                                        num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                        config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                        config["BLOCK_SIZE_K"], bit)
                elif use_int8_w8a8:
                    if is_bottom:
                        aiter.moe_c_moe_w8a8_gemm_block_wise_kernel2(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                    else:
                        aiter.moe_c_moe_w8a8_gemm_block_wise(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                elif use_fp8_w8a8:
                    if is_bottom:
                        # The fp8 kernels receive the tensors viewed as int8.
                        A = A.view(torch.int8)
                        B = B.view(torch.int8)
                        aiter.moe_c_moe_w8a8_gemm_block_wise_kernel2_fp8(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                    else:
                        A = A.view(torch.int8)
                        B = B.view(torch.int8)
                        aiter.moe_c_moe_w8a8_gemm_block_wise_fp8(A, A_scale, C, B, B_scale, B_zp,
                                            topk_weights if mul_routed_weight else None,
                                            sorted_token_ids, expert_ids,
                                            num_tokens_post_padded, block_shape[0], block_shape[1], top_k,
                                            config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                            config["BLOCK_SIZE_K"], config["kloops"], config["nloops"], bit)
                return

            else:
                bit = 4 if (use_int4_w4a16 or use_int4_w4a16_base) else 8
                if (bit == 8 and use_int8_w8a16):
                    aiter.moe_c_moe_w8a16_gemm_awq(A, C, B, B_scale, B_zp,
                                        topk_weights if mul_routed_weight else None,
                                        sorted_token_ids, expert_ids,
                                        num_tokens_post_padded, top_k,
                                        config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                        config["BLOCK_SIZE_K"], bit)
                    return

                elif use_int4_w4a16_base:
                    aiter.moe_c_moe_wna16_gemm_base(A, C, B, B_scale, B_zp,
                                        topk_weights if mul_routed_weight else None,
                                        sorted_token_ids, expert_ids,
                                        num_tokens_post_padded, top_k,
                                        config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"],
                                        config["BLOCK_SIZE_K"], bit)
                    return
                else:

                    if use_int4_w4a16:
                        # The marlin w4a16 kernel takes the packed weights as uint32.
                        B = B.view(torch.uint32)
                        if is_bottom:
                            aiter.moe_c_moe_gemm_marlin_w4a16(A, B, C, B_scale, B_zp, topk_weights,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, 1, config["MODE"], 1)
                        else:
                            # NOTE(review): the literal 8 sits where the other
                            # marlin calls pass top_k -- confirm it is not a
                            # leftover from a fixed top_k of 8.
                            aiter.moe_c_moe_gemm_marlin_w4a16(A, B, C, B_scale, B_zp, None,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, 8, config["MODE"], 1)
                        return
                    elif (use_int8_w8a8 and block_shape == None):
                        if is_bottom:
                            # Only these weight shapes are accepted on this path.
                            assert B.shape[1] in [7168,6144,4096,3072,2048] , f" K = {B.shape[1]} is not in support"
                            assert B.shape[2] in [128,256,384,768,2048] , f" N = {B.shape[2]} is not in support"
                            aiter.moe_c_moe_gemm_marlin_w8a8(A, B, C, A_scale, B_scale, topk_weights,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, 1, config["MODE"], top_k)

                        else:
                            aiter.moe_c_moe_gemm_marlin_w8a8(A, B, C, A_scale, B_scale, None,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, top_k, config["MODE"], top_k)
                        return
                    elif (use_int8_w4a8 and block_shape == None):
                        if is_bottom:
                            aiter.moe_c_moe_gemm_marlin_w4a8(A, B, C, A_scale, B_scale, topk_weights,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, 1, config["MODE"], top_k)

                        else:
                            aiter.moe_c_moe_gemm_marlin_w4a8(A, B, C, A_scale, B_scale, None,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, top_k, config["MODE"], top_k)

                        return
                    elif (use_fp8_w8a8 and block_shape == None):
                        if is_bottom:
                            aiter.moe_c_moe_gemm_marlin_w8a8_fp8(A, B, C, A_scale, B_scale, topk_weights,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, 1, config["MODE"], top_k)

                        else:
                            aiter.moe_c_moe_gemm_marlin_w8a8_fp8(A, B, C, A_scale, B_scale, None,
                                sorted_token_ids, expert_ids, num_tokens_post_padded, top_k, config["MODE"], top_k)
                        return
# Adapted from: https://github.com/sgl-project/sglang/pull/2628
def get_config_file_name(E: int,
                         N: int,
                         dtype: Optional[str],
                         block_shape: Optional[List[int]] = None,
                         is_bottom: bool = False,
                         use_moe_wna16_cuda: bool = False) -> str:
    """Build the JSON file name of a tuned MoE config, keyed by device name.

    Args:
        E: Number of experts.
        N: Size used for the ``N=`` part of the name.
        dtype: Quantization label; omitted from the name when empty/None.
        block_shape: Quantization block; omitted when missing or when any
            entry is zero.
        is_bottom: Adds ``,is_bottom=True`` when true.
        use_moe_wna16_cuda: Adds ``,is_cuda_kernel=True`` when true.

    Returns:
        A name of the form ``E=..,N=..,device_name=..[selectors].json``.
    """
    device_name = get_device_name()
    # Every device whose name starts with "BW" (any case) shares the BW200
    # configuration files.
    if device_name.upper().startswith('BW'):
        device_name = 'BW200'

    selectors = []
    if dtype:
        selectors.append(f",dtype={dtype}")
    if is_bottom:
        selectors.append(",is_bottom=True")
    if block_shape and all(block_shape):
        # str(list) contains spaces; file names are stored without them.
        selectors.append(f",block_shape={block_shape}".replace(" ", ""))
    if use_moe_wna16_cuda:
        selectors.append(",is_cuda_kernel=True")
    return f"E={E},N={N},device_name={device_name}{''.join(selectors)}.json"


def get_config_file_name_marlin(E: int,
                                N: int,
                                dtype: Optional[str],
                                block_shape: Optional[List[int]] = None,
                                is_bottom: bool = False,
                                use_moe_wna16_cuda: bool = False) -> str:
    """Build the JSON file name of a tuned marlin MoE config.

    The name is keyed by the architecture name and compute-unit count of the
    *current* device.  Both values are read from the same device so that the
    name never mixes the properties of two different GPUs.

    Args:
        E: Number of experts.
        N: Size used for the ``N=`` part of the name.
        dtype: Quantization label; omitted from the name when empty/None.
        block_shape: Accepted for signature parity with
            ``get_config_file_name``; not part of the marlin file name.
        is_bottom: Adds ``,is_bottom=True`` when true.
        use_moe_wna16_cuda: Accepted for signature parity; not used.

    Returns:
        A name of the form
        ``E=..,N=..,gfx_version=..,num_cus=..[selectors].json``.
    """
    props = torch.cuda.get_device_properties(torch.cuda.current_device())
    num_cus = props.multi_processor_count
    # gcnArchName looks like "<arch>:<features>"; only the arch part is used.
    gfx_version = props.gcnArchName.split(':')[0]
    dtype_selector = f",dtype={dtype}" if dtype else ""
    is_bottom_selector = ",is_bottom=True" if is_bottom else ""
    return f"E={E},N={N},gfx_version={gfx_version},num_cus={num_cus}{dtype_selector}{is_bottom_selector}.json"  # noqa: E501
# Adapted from: https://github.com/sgl-project/sglang/pull/2628
@functools.lru_cache
def get_moe_configs(
    E: int,
    N: int,
    dtype: Optional[str],
    block_n: Optional[int] = None,
    block_k: Optional[int] = None,
    is_bottom: bool = False,
    use_moe_wna16_cuda: bool = False,
) -> Optional[Dict[int, Any]]:
    """Load the tuned fused-MoE configurations for this problem, if any.

    The JSON file is looked up in the ``moe_c_configs`` directory next to this
    module.  When ``is_bottom`` is set and no dedicated file exists, the file
    without the ``is_bottom`` selector is tried as a fallback.

    Returns:
        A dict mapping batch size to kernel configuration (callers pick the
        entry whose batch size is closest to theirs), or None when no file was
        found.  Results are memoized by ``functools.lru_cache``.
    """
    block_shape = [block_n, block_k] if block_n and block_k else None
    config_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "moe_c_configs")

    # (arguments for the file name, log message); the fallback is only
    # consulted -- and its name only computed -- when the primary is missing.
    lookups = [((E, N, dtype, block_shape, is_bottom, use_moe_wna16_cuda),
                "Using configuration from %s for MoE layer.")]
    if is_bottom:
        lookups.append(((E, N, dtype, block_shape),
                        "Using fallback configuration from %s for MoE layer."))

    primary_path = None
    for name_args, message in lookups:
        candidate = os.path.join(config_dir, get_config_file_name(*name_args))
        if primary_path is None:
            primary_path = candidate
        if not os.path.exists(candidate):
            continue
        with open(candidate) as handle:
            logger.info(message, candidate)
            # JSON keys are strings; batch sizes are used as ints.
            return {int(batch): cfg for batch, cfg in json.load(handle).items()}

    # Nothing tuned is available: the caller falls back to default settings.
    logger.warning(
        ("Using default MoE config. Performance might be sub-optimal! "
         "Config file not found at %s"), primary_path)
    return None
@torch._dynamo.disable
@functools.lru_cache
def get_moe_configs_marlin(
    E: int,
    N: int,
    dtype: Optional[str],
    block_n: Optional[int] = None,
    block_k: Optional[int] = None,
    is_bottom: bool = False,
    use_moe_wna16_cuda: bool = False,
) -> Optional[Dict[int, Any]]:
    """Load the tuned marlin MoE configurations for this problem, if any.

    File names come from ``get_config_file_name_marlin`` and are resolved in
    the ``moe_c_configs`` directory next to this module.  With ``is_bottom``
    set, the file without the ``is_bottom`` selector serves as a fallback.

    Returns:
        A dict mapping batch size to kernel configuration, or None when no
        file exists.  Results are memoized by ``functools.lru_cache``.
    """
    configs_root = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "moe_c_configs")
    block_shape = [block_n, block_k] if block_n and block_k else None

    def _read(path: str, message: str) -> Dict[int, Any]:
        # JSON object keys are strings; convert them back to int batch sizes.
        with open(path) as stream:
            logger.info(message, path)
            return {int(key): val for key, val in json.load(stream).items()}

    config_file_path = os.path.join(
        configs_root,
        get_config_file_name_marlin(E, N, dtype, block_shape, is_bottom,
                                    use_moe_wna16_cuda))
    if os.path.exists(config_file_path):
        return _read(config_file_path,
                     "Using configuration from %s for MoE layer.")

    if is_bottom:
        # No dedicated second-GEMM file: try the generic one.
        fallback_path = os.path.join(
            configs_root,
            get_config_file_name_marlin(E, N, dtype, block_shape))
        if os.path.exists(fallback_path):
            return _read(fallback_path,
                         "Using fallback configuration from %s for MoE layer.")

    # No tuned file at all: callers use their default configuration.
    logger.warning(
        ("Using default MoE config. Performance might be sub-optimal! "
         "Config file not found at %s"), config_file_path)
    return None
def get_moe_wna16_block_config(config: Dict[str, int],
                               use_moe_wna16_cuda: bool,
                               num_valid_tokens: int, size_k: int, size_n: int,
                               num_experts: int, group_size: int,
                               real_top_k: int, block_size_m: int):
    """Choose ``BLOCK_SIZE_N`` / ``BLOCK_SIZE_K`` for the wna16 MoE kernels.

    Args:
        config: Current kernel config; when it already holds both block sizes
            nothing is added.
        use_moe_wna16_cuda: True for the compiled kernel, False for the
            Triton kernel.
        num_valid_tokens: Number of (token, expert) pairs.
        size_k: Reduction dimension of the GEMM.
        size_n: Output dimension of the GEMM.
        num_experts: Number of experts.
        group_size: Quantization group size along K.
        real_top_k: Experts selected per token.
        block_size_m: The already chosen ``BLOCK_SIZE_M``.

    Returns:
        A dict with the two block sizes to merge into ``config``, or an empty
        dict when ``config`` is already complete.
    """
    if "BLOCK_SIZE_N" in config and "BLOCK_SIZE_K" in config:
        # optimal block config is set
        return {}

    if not use_moe_wna16_cuda:
        # triton moe wna16 kernel
        if num_valid_tokens // real_top_k == 1:
            # if bs=1, use a smaller BLOCK_SIZE_N
            return {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64}
        return {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}

    # cuda moe wna16 kernel
    # Start from 128 x 128 (K never smaller than one quantization group) and
    # grow the blocks while the estimated number of blocks is too large.
    block_size_n = 128
    block_size_k = max(128, group_size)

    # Only the product of the two counts is used below.
    num_n_blocks = size_k // block_size_k
    num_k_blocks = size_n // block_size_k
    # Ceil-division: block counts are whole numbers.
    num_m_blocks = ((num_valid_tokens + block_size_m - 1) // block_size_m
                    + num_experts)
    if num_valid_tokens // real_top_k <= block_size_m:
        num_m_blocks = min(num_m_blocks, num_valid_tokens)
    num_blocks = num_m_blocks * num_n_blocks * num_k_blocks

    if size_k % 256 == 0 and num_blocks >= 256 and block_size_k < 256:
        # Scale the estimate by the growth factor *before* overwriting
        # block_size_k; doing it afterwards divides by 1 and leaves the
        # estimate unchanged.
        num_blocks = num_blocks // (256 // block_size_k)
        block_size_k = 256

    if num_m_blocks <= 16 and size_k % (block_size_k * 2) == 0 and \
            block_size_k <= 512 and num_blocks >= 512:
        block_size_k = block_size_k * 2
        num_blocks = num_blocks // 2

    if num_blocks > 1024:
        block_size_n = 256
        num_blocks = num_blocks // 2

    if size_n <= 1024 and num_blocks >= 1024:
        # The kernel performance got much better with BLOCK_SIZE_N=1024
        # when num_blocks is large, even when N is small.
        # Not sure why, maybe it forces the device to process only one block
        # at the same time.
        block_size_n = 1024

    return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k}
def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
                              num_experts: int, bit: int):
    """Report whether the compiled wna16 MoE kernel should be used.

    The heuristic below is disabled for now; the function always answers True
    and ignores its arguments.
    """
    # return bit == 4 and group_size in [32, 64, 128] and \
    #     num_valid_tokens / num_experts <= 6
    return True


def get_default_config(
    M: int,
    E: int,
    N: int,
    K: int,
    topk: int,
    dtype: Optional[str],
    is_marlin: bool,
    block_shape: Optional[List[int]] = None,
    is_bottom: bool = False,
) -> Dict[str, int]:
    """Return the kernel configuration used when no tuned file is available.

    Args:
        M: Number of tokens.
        E: Number of experts.
        N, K: Problem sizes; not used by the current heuristics.
        topk: Experts selected per token.
        dtype: Quantization label (``"fp8_w8a8"``, ``"int4_w4a16"``, ...).
        is_marlin: Whether the marlin kernels are used.
        block_shape: Quantization block ``[block_n, block_k]`` if any.
        is_bottom: Not used by the current heuristics.
    """
    quantized_blocks = block_shape is not None

    if quantized_blocks and dtype == "fp8_w8a8":
        # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
        # BLOCK_SIZE_K must be divisible by block_shape[1]
        return {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": block_shape[0],
            "BLOCK_SIZE_K": block_shape[1],
            "GROUP_SIZE_M": 32,
            "COMBINE_SCALE_LOAD": False,
            "num_warps": 4,
            "num_stages": 3,
        }

    if quantized_blocks and dtype in ["int4_w4a16", "int8_w8a16"]:
        # moe wna16 kernels: only BLOCK_SIZE_M is chosen here,
        # BLOCK_SIZE_N and BLOCK_SIZE_K are filled in later.
        bit = 4 if dtype == "int4_w4a16" else 8
        if should_moe_wna16_use_cuda(M * topk, block_shape[1], E, bit):
            return {"BLOCK_SIZE_M": min(16, M)}
        if M <= 20:
            size_m = 16
        elif M <= 40:
            size_m = 32
        else:
            size_m = 64
        return {"BLOCK_SIZE_M": size_m, "GROUP_SIZE_M": 1, "COMBINE_SCALE_LOAD": False}

    # A heuristic: fused marlin works faster with this config for small M
    if M <= E or (is_marlin and M <= 32):
        return {
            "BLOCK_SIZE_M": 16,
            "BLOCK_SIZE_N": 32,
            "BLOCK_SIZE_K": 64,
            "GROUP_SIZE_M": 1,
            "COMBINE_SCALE_LOAD": False,
        }
    return {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 32,
        "GROUP_SIZE_M": 8,
        "COMBINE_SCALE_LOAD": False,
    }
def try_get_optimal_moe_config(
    w1_shape: Tuple[int, ...],
    w2_shape: Tuple[int, ...],
    top_k: int,
    dtype: Optional[str],
    M: int,
    is_marlin: bool = False,
    block_shape: Optional[List[int]] = None,
    is_bottom: bool = False,
    use_moe_wna16_cuda: bool = False,
):
    """Pick the kernel configuration for a batch of ``M`` tokens.

    Order of precedence: the override returned by ``get_config()``, then the
    tuned entry from ``get_moe_configs`` whose batch size is closest to ``M``,
    then ``get_default_config``.
    """
    forced = get_config()
    if forced:
        return forced

    # First try to load optimal config from the file
    num_experts, _, width = w2_shape
    if dtype == "int4_w4a16":
        # The lookup key uses twice the stored width for int4 weights.
        width = width * 2
    block_n, block_k = (block_shape[0], block_shape[1]) if block_shape else (0, 0)
    tuned = get_moe_configs(num_experts, width, dtype, block_n, block_k,
                            is_bottom, use_moe_wna16_cuda)
    if tuned:
        # Use the entry tuned for the batch size nearest to M.
        nearest = min(tuned, key=lambda batch: abs(batch - M))
        return tuned[nearest]

    # Else use the default config
    return get_default_config(M, num_experts, width, w1_shape[2], top_k, dtype,
                              is_marlin, block_shape, is_bottom)


def try_get_optimal_moe_config_marlin(
    w1_shape: Tuple[int, ...],
    w2_shape: Tuple[int, ...],
    top_k: int,
    dtype: Optional[str],
    M: int,
    is_marlin: bool = False,
    block_shape: Optional[List[int]] = None,
    is_bottom: bool = False,
    use_moe_wna16_cuda: bool = False,
):
    """Pick the marlin kernel configuration for a batch of ``M`` tokens.

    Same precedence as ``try_get_optimal_moe_config`` but tuned entries come
    from ``get_moe_configs_marlin``, and the width is doubled for both the
    ``int4_w4a16`` and ``int8_w4a8`` labels.
    """
    forced = get_config()
    if forced:
        return forced

    # First try to load optimal config from the file
    num_experts, _, width = w2_shape
    if dtype in ("int4_w4a16", "int8_w4a8"):
        width = width * 2
    block_n, block_k = (block_shape[0], block_shape[1]) if block_shape else (0, 0)
    tuned = get_moe_configs_marlin(num_experts, width, dtype, block_n, block_k,
                                   is_bottom, use_moe_wna16_cuda)
    if tuned:
        # Use the entry tuned for the batch size nearest to M.
        nearest = min(tuned, key=lambda batch: abs(batch - M))
        return tuned[nearest]

    # Else use the default config
    return get_default_config(M, num_experts, width, w1_shape[2], top_k, dtype,
                              is_marlin, block_shape, is_bottom)
def fused_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    warmup: int = 0,
    rep: int = 1,
    renormalize: bool = False,
):
    """Select the top-k experts per token with ``aiter.moe_c_topk_softmax``.

    Args:
        hidden_states: 2-D tensor; only its first dimension (token count) and
            device are used.
        gating_output: Router logits, one row per token; passed to the kernel
            as float32.
        topk: Number of experts to select per token.
        warmup, rep: Not used (left over from benchmarking).
        renormalize: When true, each row of weights is divided by its sum.

    Returns:
        ``(topk_weights, topk_ids)``: float32 and int32 tensors of shape
        ``(tokens, topk)``.
    """
    assert hidden_states.shape[0] == gating_output.shape[0], (
        "Number of tokens mismatch")

    num_tokens, _ = hidden_states.shape
    device = hidden_states.device

    def _buffer(dtype):
        # Output buffers filled in by the kernel.
        return torch.empty(num_tokens, topk, dtype=dtype, device=device)

    topk_weights = _buffer(torch.float32)
    topk_ids = _buffer(torch.int32)
    # Required by the kernel signature; its content is not returned.
    token_expert_indicies = _buffer(torch.int32)

    aiter.moe_c_topk_softmax(
        topk_weights,
        topk_ids,
        token_expert_indicies,
        gating_output.float(),  # TODO(woosuk): Optimize this.
    )

    if not renormalize:
        return topk_weights, topk_ids
    return topk_weights / topk_weights.sum(dim=-1, keepdim=True), topk_ids
+ + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_ids + + +# This is used by the Deepseek-V2 and Deepseek-V3 model +@torch.compile(dynamic=True, backend=get_compile_backend) +def grouped_topk(hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None): + + assert hidden_states.shape[0] == gating_output.shape[0], ( + "Number of tokens mismatch") + + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + num_token = scores.shape[0] + if e_score_correction_bias is not None: + # Store original scores before applying correction bias. We use biased + # scores for expert selection but original scores for routing weights + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) + group_scores = (scores.view(num_token, num_expert_group, + -1).topk(2, dim=-1)[0].sum(dim=-1)) + else: + group_scores = scores.view(num_token, num_expert_group, + -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, + sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), + float("-inf")) # [n, e] + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + # Use original unbiased scores for the routing weights + topk_weights = original_scores.gather(1, topk_ids) + 
else: + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights.to(torch.float32), topk_ids.to(torch.int32) + + +def get_config_dtype_str(dtype: torch.dtype, + use_int4_w4a16: Optional[bool] = False, + use_int8_w8a16: Optional[bool] = False, + use_fp8_w8a8: Optional[bool] = False, + use_int8_w8a8: Optional[bool] = False, + use_int8_w4a8: Optional[bool] = False): + if use_fp8_w8a8: + return "fp8_w8a8" + elif use_int8_w8a8: + return "int8_w8a8" + elif use_int8_w4a8: + return "int8_w4a8" + elif use_int8_w8a16: + return "int8_w8a16" + elif use_int4_w4a16: + return "int4_w4a16" + elif dtype == torch.float: + # avoiding cases where kernel fails when float32 MoE + # use fp16/bfloat16 configs + return "float32" + return None + + +def inplace_fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + MODE1: int, + MODE2: int, + BM: int, + BN: int, + BK: int, + kloops: int, + nloops: int, + BN2: int , + BK2: int , + kloops2: int, + nloops2: int, + activation: Optional[str] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w4a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_int4_w4a16_base: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + routed_scaling_factor: Optional[float] = 1.0) -> None: + if activation is None: + activation = "silu" + + + + if (use_int4_w4a16 or (use_int8_w8a8 and block_shape == None) or (use_fp8_w8a8 and block_shape == None) or 
(use_int8_w4a8 and block_shape == None) ): + + fused_experts_impl_marlin(hidden_states, w1, w2, topk_weights, topk_ids, MODE1, MODE2, BM, + True, activation, use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16, + use_int4_w4a16, use_int4_w4a16_base, global_num_experts, expert_map, + w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, + a2_scale, block_shape, routed_scaling_factor) + else: + fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids ,BM,BN,BK,kloops, nloops,BN2, + BK2,kloops2,nloops2,True, + activation, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, use_int4_w4a16_base, global_num_experts, expert_map, + w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, + block_shape, routed_scaling_factor) + + +def inplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + routed_scaling_factor: Optional[float] = 1.0) -> None: + pass + + + +def outplace_fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + MODE1: int, + MODE2: int, + BM: int, + BN: int, + BK: int, + kloops: int , + nloops: int, + BN2: int , + BK2: int , + kloops2: int, + nloops2: int, + activation: Optional[str] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w4a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + 
use_int4_w4a16_base: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + routed_scaling_factor: Optional[float] = 1.0) -> torch.Tensor: + if activation is None: + activation = "silu" + + + if (use_int4_w4a16 or (use_int8_w8a8 and block_shape == None) or (use_fp8_w8a8 and block_shape == None) or (use_int8_w4a8 and block_shape == None) ): + return fused_experts_impl_marlin(hidden_states, w1, w2, topk_weights, topk_ids, MODE1, MODE2, BM, + False, activation, use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16, + use_int4_w4a16, use_int4_w4a16_base, global_num_experts, expert_map, + w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, + a2_scale, block_shape, routed_scaling_factor) + + + return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,BM,BN,BK,kloops,nloops,BN2, + BK2,kloops2,nloops2, + False, activation, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, + use_int4_w4a16, use_int4_w4a16_base, global_num_experts, expert_map, + w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, + a2_scale, block_shape, routed_scaling_factor) + + +def outplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: 
Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + routed_scaling_factor: Optional[float] = 1.0) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +@perftest(num_warmup=5, num_iters=10,testGraph=False) +def moe_c_fused_experts_bench(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + MODE1: int = 1, + MODE2: int = 1, + BM: int = 1, + BN: int = 1, + BK: int = 1, + kloops: int = 1, + nloops: int = 1, + BN2: int = 1, + BK2: int = 1, + kloops2: int = 1, + nloops2: int = 1, + inplace: bool = False, + activation: Optional[str] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w4a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_int4_w4a16_base: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None) -> torch.Tensor: + + return moe_c_fused_experts(hidden_states, + w1, + w2, + topk_weights, + topk_ids, + MODE1, + MODE2, + BM ,BN,BK, + kloops,nloops, + BN2, + BK2, + kloops2, + nloops2, + inplace=inplace, + activation=activation, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w4a8=use_int8_w4a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + use_int4_w4a16_base=use_int4_w4a16_base, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape) + +@torch_compile_guard() +def moe_c_fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, 
+ topk_ids: torch.Tensor, + MODE1: int = 1, + MODE2: int = 1, + BM: int = 1, + BN: int = 1, + BK: int = 1, + kloops: int = 1, + nloops: int = 1, + BN2: int = 1, + BK2: int = 1, + kloops2: int = 1, + nloops2: int = 1, + inplace: bool = False, + activation: Optional[str] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w4a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_int4_w4a16_base: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + routed_scaling_factor: Optional[float] = 1.0) -> torch.Tensor: + + # assert not (use_int8_w4a8 and hidden_states.shape[0] < 1024) , "only support M >= 1024" + + # print("*******************************use_int8_w4a8",use_int8_w4a8) + if activation is None: + activation = 'silu' + if inplace: + + inplace_fused_experts( + hidden_states, w1, w2, topk_weights, topk_ids,MODE1,MODE2,BM,BN,BK,kloops,nloops,BN2, + BK2,kloops2,nloops2,activation, + use_fp8_w8a8, use_int8_w8a8, use_int8_w4a8,use_int8_w8a16, use_int4_w4a16, use_int4_w4a16_base, global_num_experts, + expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, + block_shape, routed_scaling_factor) + # print("hidden_states",hidden_states) + return hidden_states + else: + return outplace_fused_experts( + hidden_states, w1, w2, topk_weights, topk_ids,MODE1,MODE2,BM,BN,BK,kloops,nloops,BN2, + BK2,kloops2,nloops2,activation, + use_fp8_w8a8, use_int8_w8a8,use_int8_w4a8, use_int8_w8a16, use_int4_w4a16, use_int4_w4a16_base, global_num_experts, + expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, + block_shape, routed_scaling_factor) + + + +# def moe_sum(input_tensor: 
torch.tensor, output_tensor: torch.tensor, ): +# torch.ops.sgl_kernel.moe_sum.default(input_tensor, output_tensor, ) + + + +def fused_experts_impl_channelwise_w8a8(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_new: torch.Tensor, + w2_new: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + BM: int, + BN: int, + BK: int, + kloops: int, + nloops: int, + BN2: int , + BK2: int , + kloops2: int, + nloops2: int, + inplace: bool = False, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_int4_w4a16_base: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None): + + m = hidden_states.shape[0] + topk = topk_ids.shape[1] + + e, n1, _ = w1.shape + if global_num_experts == -1: + global_num_experts = e + per_channel_quant = True + + + if inplace: + out_hidden_states = hidden_states + else: + + out_hidden_states = torch.empty_like(hidden_states) + + + + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(topk_ids, 16, global_num_experts, expert_map) + ) + + + qinput1, qa1_scale = moe_kernel_prepare_input( + A=hidden_states, + B=w1, + A_scale=None, + B_scale=w1_scale, + use_fp8_w8a8=False, + use_int8_w8a8=True, + use_int8_w8a16=False, + use_int4_w4a16=False, + per_channel_quant=per_channel_quant, + block_shape=None + ) + + cache13 = torch.empty(m * topk * max(n1, w2.shape[1]), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache1 = cache13[:m * topk * n1].view( + (m, topk_ids.shape[1], n1)) + intermediate_cache3 = cache13[:m * topk * 
w2.shape[1]].view( + (m, topk_ids.shape[1], w2.shape[1])) + + + # # intermediate_cache1 = torch.zeros((m, topk, n1), device=hidden_states.device, dtype=hidden_states.dtype) + + + aiter.moe_c_moe_gemm_marlin_w8a8(qinput1, w1_new, intermediate_cache1, qa1_scale, w1_scale, None, + sorted_token_ids, expert_ids, num_tokens_post_padded,topk, 54, 1) + intermediate_cache2 = torch.empty((m * topk, n1 // 2), device=hidden_states.device, dtype=hidden_states.dtype) + + + + # # torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, n1)) + + moe_c_silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, n1)) + + # torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, n1)) + # print("moe_kernel_prepare_input************************************") + # print(intermediate_cache1) + # print(intermediate_cache2) + # print(w2) + # intermediate_cache2 = intermediate_cache2 /51380224 + # print(intermediate_cache2) + # start = time.perf_counter() + + qinput2, qa2_scale = moe_kernel_prepare_input( + A=intermediate_cache2, + B=w2, + A_scale=None, + B_scale=w2_scale, + use_fp8_w8a8=False, + use_int8_w8a8=True, + use_int8_w8a16=False, + use_int4_w4a16=False, + per_channel_quant=per_channel_quant, + block_shape=None + ) + + + + aiter.moe_c_moe_gemm_marlin_w8a8(qinput2, w2_new, intermediate_cache3, qa2_scale, w2_scale, topk_weights, + sorted_token_ids, expert_ids, num_tokens_post_padded, 1, 54, 1) + + + # print(intermediate_cache3) + mode_use_triton_moe_sum = out_hidden_states.dtype == torch.float16 or \ + out_hidden_states.dtype == torch.bfloat16 or \ + out_hidden_states.dtype == torch.float32 + + mode_use_triton_moe_sum = True + + if mode_use_triton_moe_sum: + + triton_moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states) + else: + moe_c_moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states,topk_ids) + + + return out_hidden_states + + +def fused_experts_impl_marlin(hidden_states: 
torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + MODE1: int, + MODE2: int, + BM: int, + inplace: bool = False, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w4a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_int4_w4a16_base: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + routed_scaling_factor: Optional[float] = 1.0): + # Check constraints. + if use_int4_w4a16 or use_int8_w4a8: + assert hidden_states.shape[1] // 2 == w1.shape[ + 2], "Hidden size mismatch" + else: + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] + + num_tokens, _ = hidden_states.shape + E, N, _ = w1.shape + if global_num_experts == -1: + global_num_experts = E + top_k_num = topk_ids.shape[1] + # We execute the fused_moe kernel in chunks to circumvent this issue: + CHUNK_SIZE = 32768 + + M = min(num_tokens, CHUNK_SIZE) + config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w4a8=use_int8_w4a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + dtype=hidden_states.dtype) + + get_config_func = functools.partial( + try_get_optimal_moe_config_marlin, + w1.shape, + w2.shape, 
+ top_k_num, + config_dtype, + block_shape=block_shape, + ) + # use_moe_wna16_cuda = should_moe_wna16_use_cuda( + # num_valid_tokens=topk_ids.numel(), + # group_size=block_shape[1] if block_shape else 0, + # num_experts=w1.shape[0], + # bit=4 if use_int4_w4a16 else 8) + use_moe_wna16_cuda = True + if use_moe_wna16_cuda and config_dtype in ["int4_w4a16","int8_w8a8","fp8_w8a8","int8_w8a16", "use_int8_w4a8"] : + config = get_config_func(M,use_moe_wna16_cuda=use_moe_wna16_cuda) + else: + config = get_config_func(M) + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + cache13 = torch.empty(M * top_k_num * max(N, w2.shape[1]), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache1 = cache13[:M * top_k_num * N].view( + (M, topk_ids.shape[1], N)) + intermediate_cache3 = cache13[:M * top_k_num * w2.shape[1]].view( + (M, topk_ids.shape[1], w2.shape[1])) + + # This needs separate memory since it's used concurrently with cache1 + intermediate_cache2 = torch.empty((M * top_k_num, N // 2), + device=hidden_states.device, + dtype=hidden_states.dtype) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + else: + raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") + + + + + if inplace: + out_hidden_states = hidden_states + # out_hidden_states = torch.empty_like(hidden_states) + else: + out_hidden_states = torch.empty_like(hidden_states) + + + + for chunk in range((num_tokens // CHUNK_SIZE) + 1): + begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE, + min((chunk + 1) * CHUNK_SIZE, + num_tokens)) + curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] + tokens_in_chunk, _ = curr_hidden_states.shape + + if tokens_in_chunk == 0: + break + + if tokens_in_chunk < CHUNK_SIZE and chunk > 0: + # Adjust the 
intermediate cache size and config for the last + # chunk. Note that in most cases we only have one chunk + # so the cache size and config are already set correctly and + # do not need to be adjusted. + intermediate_cache1 = intermediate_cache1[:tokens_in_chunk] + intermediate_cache2 = intermediate_cache2[:tokens_in_chunk * + topk_ids.shape[1]] + intermediate_cache3 = intermediate_cache3[:tokens_in_chunk] + config = get_config_func(tokens_in_chunk) + + curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] + curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] + + + find_best = os.environ.get("WHICH_TO_TEST") + if(find_best): + if(use_int4_w4a16 or use_int8_w4a8): + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(curr_topk_ids, BM, + global_num_experts, expert_map)) + else: + sorted_token_ids,_,expert_ids, num_tokens_post_padded,_,_ = moe_sorting_ck( + curr_topk_ids, + topk_weights, + global_num_experts, + 7168, + hidden_states.dtype, + BM, + expert_mask=None, + ) + + else: + if(use_int4_w4a16 or use_int8_w4a8): + # print("*****************************",config["BLOCK_SIZE_M"]) + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(curr_topk_ids, config["BLOCK_SIZE_M"], + global_num_experts, expert_map)) + # print("*********************************moec:") + # print(sorted_token_ids.dtype) + # print(sorted_token_ids.tolist()) + # print(expert_ids) + else: + sorted_token_ids,_,expert_ids, num_tokens_post_padded,_,_ = moe_sorting_ck( + curr_topk_ids, + topk_weights, + global_num_experts, + 7168, + hidden_states.dtype, + config["BLOCK_SIZE_M"], + expert_mask=None, + ) + # print("*********************************ck:") + # print(sorted_token_ids.dtype) + # print(sorted_token_ids.tolist()) + # print(expert_ids) + # print("*****************************************************8",sorted_token_ids.tolist()) + + # 
print("*****************************************************8",sorted_token_ids[-100:-1].tolist()) + + # if(use_int8_w8a8 and block_shape[0] == 1): + + invoke_fused_moe_kernel_marlin(curr_hidden_states, + w1, + intermediate_cache1, + a1_scale, + w1_scale, + w1_zp, + curr_topk_weights, + curr_topk_ids, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + False, + top_k_num, + MODE1, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w4a8=use_int8_w4a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + use_int4_w4a16_base=use_int4_w4a16_base, + is_bottom = False, + block_shape=block_shape) + + if activation == "silu": + moe_c_silu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) + # elif activation == "gelu": + # torch.ops._C.gelu_and_mul(intermediate_cache2, + # intermediate_cache1.view(-1, N)) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + # use_moe_wna16_cuda = should_moe_wna16_use_cuda( + # num_valid_tokens=topk_ids.numel(), + # group_size=block_shape[1] if block_shape else 0, + # num_experts=w2.shape[0], + # bit=4 if use_int4_w4a16 else 8) + use_moe_wna16_cuda = True + + if tokens_in_chunk < CHUNK_SIZE and chunk > 0: + config = get_config_func(tokens_in_chunk, is_bottom=True) + else: + if use_moe_wna16_cuda and config_dtype in ["int4_w4a16","int8_w8a8","fp8_w8a8","int8_w8a16", "int8_w4a8"] : + config = get_config_func(M,is_bottom=True,use_moe_wna16_cuda=use_moe_wna16_cuda) + else: + config = get_config_func(M, is_bottom=True) + + + invoke_fused_moe_kernel_marlin(intermediate_cache2, + w2, + intermediate_cache3, + a2_scale, + w2_scale, + w2_zp, + curr_topk_weights, + curr_topk_ids, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + True, + top_k_num, + MODE2, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w4a8=use_int8_w4a8, + use_int8_w8a16=use_int8_w8a16, 
+ use_int4_w4a16=use_int4_w4a16, + use_int4_w4a16_base=use_int4_w4a16_base, + is_bottom = True, + block_shape=block_shape) + mode_use_triton_moe_sum = out_hidden_states.dtype == torch.float16 or \ + out_hidden_states.dtype == torch.bfloat16 or \ + out_hidden_states.dtype == torch.float32 + + mode_use_triton_moe_sum = True + + if mode_use_triton_moe_sum: + # triton_moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), + # out_hidden_states[begin_chunk_idx:end_chunk_idx]) + triton_moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states[begin_chunk_idx:end_chunk_idx] , routed_scaling_factor) + else: + moe_c_moe_sum(intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx],curr_topk_ids) + + + + return out_hidden_states + + + + + + + + + + + + + + + + +def fused_experts_impl(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + BM: int, + BN: int, + BK: int, + kloops: int, + nloops: int, + BN2: int , + BK2: int , + kloops2: int, + nloops2: int, + inplace: bool = False, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_int4_w4a16_base: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + routed_scaling_factor: Optional[float] = 1.0): + + # Check constraints. 
+ if use_int4_w4a16: + assert hidden_states.shape[1] // 2 == w1.shape[ + 2], "Hidden size mismatch" + else: + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] + + num_tokens, _ = hidden_states.shape + E, N, _ = w1.shape + if global_num_experts == -1: + global_num_experts = E + top_k_num = topk_ids.shape[1] + # We execute the fused_moe kernel in chunks to circumvent this issue: + CHUNK_SIZE = 32768 + + M = min(num_tokens, CHUNK_SIZE) + config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + dtype=hidden_states.dtype) + + get_config_func = functools.partial( + try_get_optimal_moe_config, + w1.shape, + w2.shape, + top_k_num, + config_dtype, + block_shape=block_shape, + ) + use_moe_wna16_cuda = should_moe_wna16_use_cuda( + num_valid_tokens=topk_ids.numel(), + group_size=block_shape[1], + num_experts=w1.shape[0], + bit=4 if use_int4_w4a16 else 8) + if use_moe_wna16_cuda and config_dtype in ["int4_w4a16","int8_w8a8","fp8_w8a8","int8_w8a16"] : + config = get_config_func(M,use_moe_wna16_cuda=use_moe_wna16_cuda) + else: + config = get_config_func(M) + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + cache13 = torch.empty(M * top_k_num * max(N, w2.shape[1]), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache1 = cache13[:M * top_k_num * N].view( + (M, topk_ids.shape[1], N)) + intermediate_cache3 = cache13[:M * top_k_num * w2.shape[1]].view( + (M, topk_ids.shape[1], w2.shape[1])) + + # This needs separate memory 
since it's used concurrently with cache1 + intermediate_cache2 = torch.empty((M * top_k_num, N // 2), + device=hidden_states.device, + dtype=hidden_states.dtype) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + else: + raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") + + if inplace: + out_hidden_states = hidden_states + else: + out_hidden_states = torch.empty_like(hidden_states) + + for chunk in range((num_tokens // CHUNK_SIZE) + 1): + begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE, + min((chunk + 1) * CHUNK_SIZE, + num_tokens)) + curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] + tokens_in_chunk, _ = curr_hidden_states.shape + + if tokens_in_chunk == 0: + break + + if tokens_in_chunk < CHUNK_SIZE and chunk > 0: + # Adjust the intermediate cache size and config for the last + # chunk. Note that in most cases we only have one chunk + # so the cache size and config are already set correctly and + # do not need to be adjusted. 
+ intermediate_cache1 = intermediate_cache1[:tokens_in_chunk] + intermediate_cache2 = intermediate_cache2[:tokens_in_chunk * + topk_ids.shape[1]] + intermediate_cache3 = intermediate_cache3[:tokens_in_chunk] + config = get_config_func(tokens_in_chunk) + + curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] + curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] + + + find_best = os.environ.get("WHICH_TO_TEST") + if(find_best): + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(curr_topk_ids, BM, + global_num_experts, expert_map)) + else: + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(curr_topk_ids, config["BLOCK_SIZE_M"], + global_num_experts, expert_map)) + + # if(use_int8_w8a8 and block_shape[0] == 1): + + invoke_fused_moe_kernel(curr_hidden_states, + w1, + w1, #需要修改 调用链路 只需要传递shuffle权重即可 w8a8_per_token已修改 + intermediate_cache1, + a1_scale, + w1_scale, + w1_zp, + curr_topk_weights, + curr_topk_ids, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + False, + top_k_num, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + use_int4_w4a16_base=use_int4_w4a16_base, + BM = BM, + BN = BN, + BK = BK, + kloops = kloops, + nloops = nloops, + is_bottom = False, + block_shape=block_shape) + + if activation == "silu": + moe_c_silu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) + # elif activation == "gelu": + # torch.ops._C.gelu_and_mul(intermediate_cache2, + # intermediate_cache1.view(-1, N)) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + use_moe_wna16_cuda = should_moe_wna16_use_cuda( + num_valid_tokens=topk_ids.numel(), + group_size=block_shape[1], + num_experts=w2.shape[0], + bit=4 if use_int4_w4a16 else 8) + + if tokens_in_chunk < CHUNK_SIZE and chunk > 0: + config = get_config_func(tokens_in_chunk, is_bottom=True) + 
def fused_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    MODE1: int = 1,
    MODE2: int = 1,
    BM: int = 1,
    BN: int = 1,
    BK: int = 1,
    kloops: int = 1,
    nloops: int = 1,
    BN2: int = 1,
    BK2: int = 1,
    kloops2: int = 1,
    nloops2: int = 1,
    inplace: bool = False,
    activation: str = "silu",
    use_grouped_topk: bool = False,
    num_expert_group: Optional[int] = None,
    topk_group: Optional[int] = None,
    custom_routing_function: Optional[Callable] = None,
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w4a8 : bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    use_int4_w4a16_base: bool = False,
    global_num_experts: int = -1,
    expert_map: Optional[torch.Tensor] = None,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
) -> torch.Tensor:
    """
    Run a Mixture-of-Experts layer: pick the top-k experts per token, then
    evaluate the two expert weight sets (w1, w2) via ``moe_c_fused_experts``.

    Routing is chosen in this order: ``grouped_topk`` when
    ``use_grouped_topk`` is set, otherwise ``custom_routing_function`` when
    one is supplied, otherwise ``fused_topk``.

    Parameters:
    - hidden_states (torch.Tensor): Input activations of the MoE layer.
    - w1 (torch.Tensor): First set of expert weights.
    - w2 (torch.Tensor): Second set of expert weights.
    - gating_output (torch.Tensor): Router output (before softmax).
    - topk (int): Number of experts selected per token.
    - renormalize (bool): If True, top-k weights are renormalized to sum
        to 1.
    - MODE1, MODE2, BM, BN, BK, kloops, nloops, BN2, BK2, kloops2, nloops2:
        Forwarded positionally, unchanged, to ``moe_c_fused_experts``.
        NOTE(review): these look like kernel mode/tiling knobs; their exact
        meaning is not visible here - confirm against the callee.
    - inplace (bool): If True, perform the operation in-place.
    - activation (str): Activation applied after the first expert matmul.
    - use_grouped_topk (bool): Use ``grouped_topk`` routing; requires
        ``num_expert_group`` and ``topk_group``.
    - num_expert_group, topk_group: Extra arguments for ``grouped_topk``.
    - custom_routing_function: Optional callable invoked as
        ``f(hidden_states, gating_output, topk, renormalize)`` and expected
        to return ``(topk_weights, topk_ids)``.
    - use_fp8_w8a8 / use_int8_w8a8 / use_int8_w4a8 / use_int8_w8a16 /
        use_int4_w4a16 / use_int4_w4a16_base: Quantization mode flags,
        forwarded unchanged.
    - global_num_experts (int): Number of experts in the global expert
        space.
    - expert_map: Optional mapping from global to local expert indices for
        an expert-parallel shard.
    - w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale: Optional
        quantization scales / zero points, forwarded unchanged.
    - block_shape: Optional block size for block-wise quantization.

    Returns:
    - torch.Tensor: Whatever ``moe_c_fused_experts`` returns for the routed
        tokens.
    """
    routing_args = (hidden_states, gating_output, topk, renormalize)

    if use_grouped_topk:
        assert num_expert_group is not None and topk_group is not None
        routed = grouped_topk(*routing_args, num_expert_group, topk_group)
    elif custom_routing_function is not None:
        routed = custom_routing_function(*routing_args)
    else:
        routed = fused_topk(*routing_args)
    topk_weights, topk_ids = routed

    # Keyword arguments handed to the expert kernel launcher as-is.
    expert_kwargs = dict(
        inplace=inplace,
        activation=activation,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a8=use_int8_w8a8,
        use_int8_w4a8=use_int8_w4a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
        use_int4_w4a16_base=use_int4_w4a16_base,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        w1_zp=w1_zp,
        w2_zp=w2_zp,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        block_shape=block_shape,
    )
    return moe_c_fused_experts(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        MODE1,
        MODE2,
        BM,
        BN,
        BK,
        kloops,
        nloops,
        BN2,
        BK2,
        kloops2,
        nloops2,
        **expert_kwargs,
    )
class MoeQuantType:
    """String identifiers for the MoE quantization modes known to the CK path."""

    # Unquantized weights and activations.
    NO_QUANT = "no_quant"
    # 4-bit weights with 16-bit activations.
    INT4_W4A16 = "int4_w4a16"
    # 4-bit weights with 8-bit activations.
    INT4_W4A8 = "int4_w4a8"
    # int8 weights/activations, block-wise variant.
    INT8_W8A8 = "int8_w8a8_block"
    # int8 weights/activations, per-channel variant.
    INT8_W8A8_C = "int8_w8a8_channel"

    # Every identifier accepted by is_valid(), in declaration order.
    ALL_TYPES = [NO_QUANT, INT4_W4A16, INT4_W4A8, INT8_W8A8, INT8_W8A8_C]

    @classmethod
    def get_default(cls) -> str:
        """Return the identifier used when no quantization is requested."""
        return cls.NO_QUANT

    @classmethod
    def is_valid(cls, qtype_str: str) -> bool:
        """Return True when *qtype_str* is one of the identifiers in ALL_TYPES."""
        known_types = cls.ALL_TYPES
        return qtype_str in known_types
print(f">>> Warning: config file {ck_tuned_file} is not found, using default ck solution.") + return functools.partial(ck_moe, solution_id = 0) + + mask = ( + (moe_ck_cfg["indtype"] == str(indtype)) & + (moe_ck_cfg["inter_dim"] == inter_dim) & + (moe_ck_cfg["model_dim"] == model_dim) & + (moe_ck_cfg["expert"] == expert) & + (moe_ck_cfg["topk"] == topk) & + (moe_ck_cfg["quant_type"] == str(quant_type)) & + (moe_ck_cfg["q_size_n"] == q_size_n) & + (moe_ck_cfg["q_size_k"] == q_size_k) + ) + matching_configs = moe_ck_cfg[mask] + if matching_configs.empty: + sol_id = 0 + print(f">>> Warning: No matching config pattern found, using default ck solution.") + return functools.partial(ck_moe, solution_id=sol_id) + + # 1. 精确匹配 token + exact_match = matching_configs[matching_configs["token"] == token] + if not exact_match.empty: + sol_id = int(exact_match.iloc[0]["sol_id"]) + print(f">>> Info: Exact token match found for token={token}, using sol_id={sol_id}.") + return functools.partial(ck_moe, solution_id=sol_id) + + # 2. 
def build_moe_index(df):
    """Turn a tuning table into nested dicts that need no pandas at lookup time.

    *df* must provide ``itertuples(index=False)`` yielding rows with the
    attributes ``arch``, ``inter_dim``, ``model_dim``, ``expert``, ``topk``,
    ``quant_type``, ``q_size_n``, ``q_size_k``, ``token`` and ``sol_id``.

    Returns a dict keyed by
    ``(arch, inter_dim, model_dim, expert, topk, quant_type, q_size_n, q_size_k)``.
    Each value is a dict with:
      * ``"token_to_sol"``: token count -> solution id (a later row with the
        same token overwrites an earlier one);
      * ``"tokens"``: every token count seen for that key, sorted ascending
        (duplicates are kept).
    """
    index = {}
    for record in df.itertuples(index=False):
        shape_key = (
            record.arch,
            int(record.inter_dim),
            int(record.model_dim),
            int(record.expert),
            int(record.topk),
            str(record.quant_type),
            int(record.q_size_n),
            int(record.q_size_k),
        )
        bucket = index.setdefault(shape_key, {"token_to_sol": {}, "tokens": []})
        tok = int(record.token)
        bucket["token_to_sol"][tok] = int(record.sol_id)
        bucket["tokens"].append(tok)

    # Sorted token lists allow a binary search for the nearest tuned size.
    for bucket in index.values():
        bucket["tokens"].sort()

    return index
moe_ck_noquant_index, moe_ck_int8_w8a8_group_index + + current_index = None + + if moe_ck_cfg is None or quant_type != current_quant_type: + if quant_type == MoeQuantType.INT8_W8A8: + if moe_ck_int8_w8a8_group_cfg is None: + moe_ck_int8_w8a8_group_cfg = get_moe_cfg(ck_tuned_int8_w8a8_group_file) + if moe_ck_int8_w8a8_group_cfg is not None: + moe_ck_int8_w8a8_group_index = build_moe_index(moe_ck_int8_w8a8_group_cfg) + moe_ck_cfg = moe_ck_int8_w8a8_group_cfg + elif quant_type == MoeQuantType.NO_QUANT: + if moe_ck_noquant_cfg is None: + moe_ck_noquant_cfg = get_moe_cfg(ck_tuned_file) + if moe_ck_noquant_cfg is not None: + moe_ck_noquant_index = build_moe_index(moe_ck_noquant_cfg) + moe_ck_cfg = moe_ck_noquant_cfg + else: + print(f">>> Warning: quant_type {quant_type} not supported for CK lookup, fallback to no-quant table.") + if moe_ck_noquant_cfg is None: + moe_ck_noquant_cfg = get_moe_cfg(ck_tuned_file) + if moe_ck_noquant_cfg is not None: + moe_ck_noquant_index = build_moe_index(moe_ck_noquant_cfg) + moe_ck_cfg = moe_ck_noquant_cfg + quant_type = MoeQuantType.NO_QUANT + + current_quant_type = quant_type + + if quant_type == MoeQuantType.INT8_W8A8: + current_index = moe_ck_int8_w8a8_group_index + else: + current_index = moe_ck_noquant_index + + if moe_ck_cfg is None: + print(f">>> Warning: config file is not found, using default ck solution.") + return 0 + if current_index is None: + print(f">>> Warning: ck index is not built, using default ck solution.") + return 0 + + key = (arch, inter_dim, model_dim, expert, topk, str(quant_type), q_size_n, q_size_k) + candidates = current_index.get(key) + + + if not candidates: + print(f">>> Warning: No matching config pattern found for key={key}, using default ck solution.") + return 0 + + + # 1. 精确匹配 token + token = int(token) + token_to_sol = candidates["token_to_sol"] + sol_id = token_to_sol.get(token) + if sol_id is not None: + return int(sol_id) + + # 2. 
def ck_moe_stage_2(
    hidden_states,
    w1,  # [E, inter_dim*2, model_dim]
    w2,  # [E, model_dim, inter_dim]
    sorted_token_ids,  # [max_num_tokens_padded]
    sorted_expert_ids,  # [max_num_m_blocks]
    tokens_positions_per_expert,  # [num_experts*2]
    num_valid_ids,  # [1]
    use_int8_w8a8_block: bool,
    use_fp8_w8a8_block: bool,
    w2_scale,
    a2_scale,
    dtype,
    topk,
    block_shape_n=0,
    block_shape_k=0,
    block_size=16,
    sorted_weights=None,  # [max_num_tokens_padded]
    moe_buf=None,  # [token_num, model_dim]
):
    """Launch the second stage of the two-stage CK MoE and return its output.

    The input is flattened to 2-D ``(-1, last_dim)`` and handed to
    ``aiter.ck_moe_stage_2`` together with the sorting metadata, the
    quantization flags/scales and an output buffer.

    Args:
        hidden_states: Stage-1 output (possibly quantized). It is viewed as
            ``(rows, last_dim)``; ``rows // topk`` is used as the token count
            when the output buffer is allocated here.
        w1, w2: Expert weights, forwarded unchanged (shape notes in the
            signature are the original author's).
        sorted_token_ids, sorted_expert_ids, tokens_positions_per_expert,
        num_valid_ids: Sorting metadata, forwarded unchanged.
        use_int8_w8a8_block, use_fp8_w8a8_block: Quantization mode flags.
        w2_scale, a2_scale: Weight / activation scales for this stage.
        dtype: dtype the result is converted to before returning.
        topk: Number of experts per token.
        block_shape_n, block_shape_k: Quantization block sizes.
        block_size: Forwarded unchanged to the kernel.
        sorted_weights: Optional routing weights, forwarded unchanged.
        moe_buf: Optional preallocated output buffer. When ``None`` a zeroed
            fp32 buffer of shape ``(rows // topk, w2.shape[1])`` is created.

    Returns:
        The output buffer converted to ``dtype``.
    """
    # The original discarded this reshape result, so an input with more than
    # two dimensions was never actually flattened.
    hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])

    if moe_buf is None:
        # Original author's notes: the buffer must start at zero because the
        # kernel accumulates with atomic adds, and fp32 is used because fp16
        # atomic add is not supported on the GPU.
        out = torch.zeros(
            (hidden_states.shape[0] // topk, w2.shape[1]),  # [token_num, model_dim]
            dtype=dtypes.fp32,
            device=hidden_states.device,
        )
    else:
        out = moe_buf

    aiter.ck_moe_stage_2(
        hidden_states,
        w1,
        w2,
        sorted_token_ids,
        sorted_expert_ids,
        tokens_positions_per_expert,
        num_valid_ids,
        out,
        topk,
        use_int8_w8a8_block,
        use_fp8_w8a8_block,
        w2_scale,
        a2_scale,
        block_shape_n,
        block_shape_k,
        block_size,
        sorted_weights,
    )
    return out.to(dtype)
@torch_compile_guard(gen_fake=fused_moe_fake)
def ck_fused_experts_2stage_impl(hidden_states: torch.Tensor,
                        w1: torch.Tensor,
                        w2: torch.Tensor,
                        topk_weights: torch.Tensor,
                        topk_ids: torch.Tensor,
                        odtype:torch.dtype, #compute or output type for i8& f8
                        inplace: bool = False,
                        activation: str = "silu",
                        use_fp8_w8a8: bool = False,
                        use_int8_w8a8: bool = False,
                        use_int8_w8a16: bool = False,
                        use_int4_w4a16: bool = False,
                        use_int4_w4a8: bool = False,
                        per_channel_quant: bool = False,
                        global_num_experts: int = -1,
                        expert_map: Optional[torch.Tensor] = None,
                        w1_scale: Optional[torch.Tensor] = None,
                        w2_scale: Optional[torch.Tensor] = None,
                        w1_zp: Optional[torch.Tensor] = None,
                        w2_zp: Optional[torch.Tensor] = None,
                        a1_scale: Optional[torch.Tensor] = None,
                        a2_scale: Optional[torch.Tensor] = None,
                        block_shape: Optional[List[int]] = None,
                        solution_id: Optional[int] = None)-> torch.Tensor:
    """Two-stage CK fused MoE for block-scaled int8 / fp8 (w8a8) weights.

    Steps: sort tokens by expert (``moe_sorting``), quantize the input if it
    is fp16/bf16, run ``ck_moe_stage_1``, re-quantize its output, then run
    ``ck_moe_stage_2`` which applies the top-k routing weights.

    Args:
        hidden_states: 2-D input activations. fp16/bf16 inputs are quantized
            here; any other dtype is passed through with ``a1_scale``.
        w1, w2: 3-D expert weights; ``w1.shape[0]`` is used as the expert
            count and ``w2.shape[1]`` as the model dimension.
        topk_weights, topk_ids: Routing weights and expert ids;
            ``topk_ids.shape[1]`` is the top-k count.
        odtype: dtype passed to both stages as their output dtype.
        activation: ``"silu"`` selects ``ActivationType.Silu``; any other
            value selects ``ActivationType.Gelu``.
        use_int8_w8a8, use_fp8_w8a8: Select the quantized path (int8 is
            checked first).
        per_channel_quant: Per-channel scaling is not supported; a message is
            printed and ``None`` is returned.
        w1_scale, w2_scale: Weight scales for stage 1 / stage 2.
        a1_scale: Activation scale used when the input is already quantized.
        block_shape: ``[block_n, block_k]`` quantization block sizes;
            ``None`` means ``(0, 0)``.
        inplace, use_int8_w8a16, use_int4_w4a16, use_int4_w4a8,
        global_num_experts, expert_map, w1_zp, w2_zp, a2_scale, solution_id:
            Accepted for interface compatibility; not used in this function.

    Returns:
        The stage-2 output, or ``None`` when neither w8a8 mode is selected or
        per-channel quantization is requested.
    """
    # The tuple unpacking doubles as a rank check (2-D input, 3-D weights).
    _num_tokens, _ = hidden_states.shape
    E, _, _ = w1.shape
    _, model_dim, _ = w2.shape
    top_k_num = topk_ids.shape[1]

    # The original wrote `a, b = x[0], x[1] if x is not None else (0, 0)`,
    # which parses as `x[0], (x[1] if ... else (0, 0))` and therefore indexed
    # block_shape before the None check. Branch explicitly instead.
    if block_shape is not None:
        quant_block_n, quant_block_k = block_shape[0], block_shape[1]
    else:
        quant_block_n, quant_block_k = 0, 0

    (sorted_ids,
     sorted_weights,
     sorted_expert_ids,
     num_valid_ids,
     tokens_positions_per_expert,
     moe_buf) = moe_sorting(
        topk_ids, topk_weights, E, model_dim, torch.float32, BLOCK_SIZE_M
    )

    # Pick the quantized dtype and the matching kernel flags. The int8 and
    # fp8 paths are otherwise identical.
    if use_int8_w8a8:
        if per_channel_quant:
            print(">>> ck fused moe int8 w8a8 per channel not supported yet.")
            return None
        quant_dtype = torch.int8
        int8_block, fp8_block = True, False
    elif use_fp8_w8a8:
        if per_channel_quant:
            print(">>> ck fused moe fp8 w8a8 per channel not supported yet.")
            return None
        quant_dtype = torch.float8_e4m3fn
        int8_block, fp8_block = False, True
    else:
        return None

    def _block_quant(x):
        # Quantization with (1, quant_block_k) blocks, built per call exactly
        # as the original did.
        return per_block_quant_wrapper((1, quant_block_k))(per_token_quant_hip)(
            x, quant_dtype=quant_dtype
        )

    # Quantize the input only when it arrives in half precision; otherwise it
    # is taken as already quantized, with its scale in a1_scale.
    if hidden_states.dtype == torch.float16 or hidden_states.dtype == torch.bfloat16:
        input_q, input_scale = _block_quant(hidden_states)
    else:
        input_q, input_scale = hidden_states, a1_scale

    out_st1 = ck_moe_stage_1(
        input_q,  # input is quantized outside the kernel for now
        w1,
        w2,
        sorted_ids,
        sorted_expert_ids,
        tokens_positions_per_expert,
        num_valid_ids,
        int8_block,
        fp8_block,
        w1_scale,
        input_scale,
        odtype,  # fp16/bf16 compute
        top_k_num,
        block_shape_n=quant_block_n,
        block_shape_k=quant_block_k,
        block_size=BLOCK_SIZE_M,
        Activation=ActivationType.Silu if activation == "silu" else ActivationType.Gelu,
        sorted_weights=None,  # stage 1 does not apply the top-k weights
    )

    # Re-quantize the stage-1 output before feeding it to stage 2.
    out_st1 = out_st1.reshape(-1, out_st1.shape[-1])
    bridge_q, bridge_scale = _block_quant(out_st1)

    out = ck_moe_stage_2(
        bridge_q,
        w1,
        w2,
        sorted_ids,
        sorted_expert_ids,
        tokens_positions_per_expert,
        num_valid_ids,
        int8_block,
        fp8_block,
        w2_scale,
        bridge_scale,
        odtype,  # fp16/bf16 compute
        top_k_num,
        block_shape_n=quant_block_n,
        block_shape_k=quant_block_k,
        block_size=BLOCK_SIZE_M,
        sorted_weights=sorted_weights,  # stage 2 applies the top-k weights
        moe_buf=moe_buf,
    )
    return out
# The outside interface
def run_fused_experts_ck_impl(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    odtype:torch.dtype, #compute or output type for i8& f8
    inplace: bool = False,
    activation: str = "silu",
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    use_int4_w4a8: bool = False,
    per_channel_quant: bool = False,
    global_num_experts: int = -1,
    block_m: int = BLOCK_SIZE_M,
    expert_map: Optional[torch.Tensor] = None,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
    use_shuffle: Optional[bool] = False,
    routed_scaling_factor: Optional[float] = 1.0,
    solution_id: Optional[int] = None)-> torch.Tensor:
    """Pick a CK MoE solution and dispatch to the one- or two-stage implementation.

    When ``solution_id`` is ``None`` it is chosen as follows:
      * shuffle requested -> ``0`` (per the original note, only the one-stage
        path supports shuffle for now);
      * int8 w8a8 with ``block_shape[1] == 64`` -> ``1 << 30`` (per the
        original note, only the two-stage path supports that block size);
      * otherwise -> looked up with ``get_moe_ck_solution_id``.

    Bits 30-31 of the solution id select the implementation: value ``1``
    runs ``ck_fused_experts_2stage_impl``; anything else runs
    ``ck_fused_experts_1stage_impl``.

    Args:
        hidden_states, w1, w2, topk_weights, topk_ids: MoE inputs, forwarded
            to the selected implementation. ``w2.shape`` is read as
            ``(E, model_dim, inter_dim)`` for the lookup.
        odtype: Output dtype forwarded to the selected implementation.
        block_shape: Optional ``[block_n, block_k]``; ``None`` is treated as
            ``(0, 0)``.
        block_m, use_shuffle: Forwarded to the one-stage implementation.
        expert_map: Forwarded as ``expert_map`` (two-stage) or as the last
            positional argument, ``expert_mask`` (one-stage).
        solution_id: Explicit solution id; skips the selection logic.
        routed_scaling_factor: Accepted but not used in this function.
        Remaining flags/scales/zero points: forwarded unchanged.

    Returns:
        The tensor (or ``None``) returned by the selected implementation.
    """
    # Resolve the block sizes once. The original indexed block_shape[1] in the
    # int8 check without the None guard it used everywhere else, so
    # block_shape=None with int8 raised TypeError.
    if block_shape is not None:
        block_n, block_k = block_shape[0], block_shape[1]
    else:
        block_n, block_k = 0, 0

    if solution_id is None:
        if use_shuffle and use_shuffle == True:  # noqa: E712 - keep original semantics
            # Only the one-stage path supports shuffle for now.
            solution_id = 0
        else:
            arch = get_gfx()
            quantType = MoeQuantType.NO_QUANT
            if use_int8_w8a8:
                quantType = MoeQuantType.INT8_W8A8

            E, model_dim, inter_dim = w2.shape
            topk = topk_ids.shape[1]

            if quantType == MoeQuantType.INT8_W8A8 and block_k == 64:
                # Only the two-stage path supports block_shape_k = 64.
                solution_id = 1 << 30
            else:
                solution_id = get_moe_ck_solution_id(
                    arch,
                    quantType,
                    hidden_states.shape[0],
                    inter_dim,
                    model_dim,
                    E,
                    topk,
                    block_n,
                    block_k,
                )

    solutionType = bits30_31(solution_id)

    # two stage
    if solutionType == 1:
        return ck_fused_experts_2stage_impl(
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            odtype,
            inplace,
            activation,
            use_fp8_w8a8,
            use_int8_w8a8,
            use_int8_w8a16,
            use_int4_w4a16,
            use_int4_w4a8,
            per_channel_quant,
            global_num_experts,
            expert_map,
            w1_scale,
            w2_scale,
            w1_zp,
            w2_zp,
            a1_scale,
            a2_scale,
            block_shape,
            solution_id)

    # one stage
    return ck_fused_experts_1stage_impl(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        odtype,
        use_int8_w8a16,
        use_int4_w4a16,
        use_int8_w8a8,
        use_int4_w4a8,
        w1_zp,
        w2_zp,
        w1_scale,
        w2_scale,
        a1_scale,
        a2_scale,
        block_n,
        block_k,
        block_m,
        use_shuffle,
        solution_id,
        expert_map)
def convert_int8_to_uint32_int4(tensor: torch.Tensor) -> torch.Tensor:
    """Pack the low 4 bits of every 8 consecutive int8 values into one uint32.

    The last dimension is split into groups of 8. Within a group, element
    ``i`` contributes its low nibble (``value & 0x0F``) at bit offset
    ``4 * i``, so element 0 lands in bits 0-3 and element 7 in bits 28-31.

    The original indexed with ``[:, :, :, i]`` and therefore only accepted
    3-D input; ellipsis indexing keeps that result and also accepts any other
    rank.

    Args:
        tensor: Tensor of dtype ``dtypes.i8`` whose last dimension is a
            multiple of 8.

    Returns:
        A tensor with the same leading dimensions and a last dimension of
        ``tensor.shape[-1] // 8``, reinterpreted as ``torch.uint32``.

    Raises:
        AssertionError: If ``tensor`` is not ``dtypes.i8``.
        ValueError: If the last dimension is not divisible by 8.
    """
    assert tensor.dtype == dtypes.i8, "input should be int8"

    if tensor.shape[-1] % 8 != 0:
        raise ValueError("k % 8 should be zero")

    grouped = tensor.reshape(*tensor.shape[:-1], tensor.shape[-1] // 8, 8)
    # Keep only the low nibble of each value, widened so the shifts below
    # have 32 bits to work in.
    nibbles = (grouped & 0x0F).to(dtypes.i32)

    merged = nibbles[..., 0]
    for pos in range(1, 8):
        merged = merged | (nibbles[..., pos] << (4 * pos))
    return merged.view(dtype=torch.uint32)
def mp_lock(
    lockPath: str,
    MainFunc: Callable,
    FinalFunc: Optional[Callable] = None,
    WaitFunc: Optional[Callable] = None,
):
    """
    Run ``MainFunc`` in exactly one process, coordinated through a FileBaton.

    The process that acquires the baton at ``lockPath`` runs ``MainFunc`` and
    then ``FinalFunc`` (if given), and releases the baton. Every other process
    waits for the baton and then runs ``WaitFunc`` (if given).

    Args:
        lockPath: Path handed to ``FileBaton``.
        MainFunc: Callable run by the process that holds the baton.
        FinalFunc: Optional callable run after ``MainFunc`` by the baton
            holder, whether or not ``MainFunc`` raised.
        WaitFunc: Optional callable run by a process that had to wait.

    Returns:
        The result of ``MainFunc`` for the baton holder. For a waiting
        process, the result of ``WaitFunc``, or ``None`` when no ``WaitFunc``
        was given. (The original always returned ``None`` to waiters because
        the ``WaitFunc`` result was immediately overwritten.)
    """
    baton = FileBaton(lockPath)
    if baton.try_acquire():
        try:
            try:
                ret = MainFunc()
            finally:
                if FinalFunc is not None:
                    FinalFunc()
        finally:
            # Release even if FinalFunc raises, so waiters are not left
            # blocked on a baton nobody will release.
            baton.release()
    else:
        baton.wait()
        ret = WaitFunc() if WaitFunc is not None else None
    return ret
def get_config_file(env_name, default_file, tuned_file_name):
    """Resolve the tuned-config CSV path for one operator.

    If the environment variable ``env_name`` is set and non-empty, its value
    (one path, or several joined by ``os.pathsep``) is handed to
    ``update_config_files``. Otherwise ``aiter/configs/model_configs/`` is
    searched for files whose name contains ``tuned_file_name`` but not
    ``"untuned"``. With no match ``default_file`` is returned as is; with
    matches they are merged with ``default_file`` through
    ``update_config_files``.

    Args:
        env_name: Name of the environment variable that overrides the config.
        default_file: Path of the default tuned CSV.
        tuned_file_name: Base name used both for the glob pattern and as the
            merge name passed to ``update_config_files``.

    Returns:
        Path of the config file to use.
    """
    from pathlib import Path

    config_env_file = os.getenv(env_name)
    if config_env_file:
        return update_config_files(config_env_file, tuned_file_name)

    model_config_dir = Path(f"{AITER_ROOT_DIR}/aiter/configs/model_configs/")
    # Test the file name only: a checkout directory that contains "untuned"
    # must not hide every tuned file. Sorting makes the merge order
    # independent of filesystem enumeration order.
    op_tuned_file_list = sorted(
        p
        for p in model_config_dir.glob(f"*{tuned_file_name}*")
        if p.is_file() and "untuned" not in p.name
    )
    if not op_tuned_file_list:
        return default_file

    # update_config_files splits on os.pathsep, so join with the same
    # separator rather than a literal ":".
    tuned_files = os.pathsep.join([default_file, *(str(p) for p in op_tuned_file_list)])
    logger.info(
        f"merge tuned file under model_configs/ and configs/ {tuned_files}"
    )
    return update_config_files(tuned_files, tuned_file_name)
"AITER_CONFIG_A8W8_BATCHED_GEMM", + f"{AITER_ROOT_DIR}/aiter/configs/a8w8_tuned_batched_gemm.csv", +) + +AITER_CONFIG_BF16_BATCHED_GEMM = os.getenv( + "AITER_CONFIG_BF16_BATCHED_GEMM", + f"{AITER_ROOT_DIR}/aiter/configs/bf16_tuned_batched_gemm.csv", +) + +AITER_CONFIG_GEMM_BF16 = os.getenv( + "AITER_CONFIG_GEMM_BF16", + f"{AITER_ROOT_DIR}/aiter/configs/tuned_gemm.csv", +) +AITER_CONFIG_GEMM_A4W4_FILE = get_config_file( + "AITER_CONFIG_GEMM_A4W4", AITER_CONFIG_GEMM_A4W4, "a4w4_blockscale_tuned_gemm" +) + +AITER_CONFIG_GEMM_A8W8_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8", AITER_CONFIG_GEMM_A8W8, "a8w8_tuned_gemm" +) +AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE", + AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE, + "a8w8_bpreshuffle_tuned_gemm", +) +AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE", + AITER_CONFIG_GEMM_A8W8_BLOCKSCALE, + "a8w8_blockscale_tuned_gemm", +) +AITER_CONFIG_FMOE_FILE = get_config_file( + "AITER_CONFIG_FMOE", AITER_CONFIG_FMOE, "tuned_fmoe" +) + +AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE = get_config_file( + "AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE", + AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE, + "a8w8_blockscale_bpreshuffle_tuned_gemm", +) + +AITER_CONFIG_A8W8_BATCHED_GEMM_FILE = get_config_file( + "AITER_CONFIG_A8W8_BATCHED_GEMM", + AITER_CONFIG_A8W8_BATCHED_GEMM, + "a8w8_tuned_batched_gemm", +) + +AITER_CONFIG_BF16_BATCHED_GEMM_FILE = get_config_file( + "AITER_CONFIG_BF16_BATCHED_GEMM", + AITER_CONFIG_BF16_BATCHED_GEMM, + "bf16_tuned_batched_gemm", +) + +AITER_CONFIG_GEMM_BF16_FILE = get_config_file( + "AITER_CONFIG_GEMM_BF16", AITER_CONFIG_GEMM_BF16, "bf16_tuned_gemm" +) + +# config_env end here + +find_aiter = importlib.util.find_spec("aiter") +if find_aiter is not None: + if find_aiter.submodule_search_locations: + package_path = find_aiter.submodule_search_locations[0] + elif find_aiter.origin: + package_path = 
find_aiter.origin + package_path = os.path.dirname(package_path) + package_parent_path = os.path.dirname(package_path) + + try: + with open(f"{this_dir}/../install_mode", "r") as f: + # develop mode + isDevelopMode = f.read().strip() == "develop" + except FileNotFoundError: + # pip install -e + isDevelopMode = True + + if isDevelopMode: + AITER_META_DIR = AITER_ROOT_DIR + # install mode + else: + AITER_META_DIR = os.path.abspath(f"{AITER_ROOT_DIR}/aiter_meta/") +else: + AITER_META_DIR = AITER_ROOT_DIR + logger.warning("aiter is not installed.") +sys.path.insert(0, AITER_META_DIR) +AITER_CSRC_DIR = f"{AITER_META_DIR}/csrc" +AITER_GRADLIB_DIR = f"{AITER_META_DIR}/gradlib" +gfx = get_gfx() +AITER_ASM_DIR = f"{AITER_META_DIR}/hsa/{gfx}/" +os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR +CK_3RDPARTY_DIR = os.environ.get( + "CK_DIR", f"{AITER_META_DIR}/3rdparty/composable_kernel" +) +CK_DIR = CK_3RDPARTY_DIR + +MOE_C_3RDPARTY_DIR = os.environ.get( + "MOE_C_DIR", f"{AITER_META_DIR}/3rdparty/moe_c" +) + +MOE_C_DIR = MOE_C_3RDPARTY_DIR + +os.environ["AITER_META_DIR"] = AITER_META_DIR + + +@functools.lru_cache(maxsize=1) +def get_asm_dir(): + return AITER_ASM_DIR + + +@functools.lru_cache(maxsize=1) +def get_user_jit_dir() -> str: + if "AITER_JIT_DIR" in os.environ: + path = os.getenv("AITER_JIT_DIR", "") + os.makedirs(path, exist_ok=True) + sys.path.insert(0, path) + return path + else: + if os.access(this_dir, os.W_OK): + return this_dir + home_jit_dir = f"{os.path.expanduser('~')}/.aiter/{os.path.basename(this_dir)}" + if not os.path.exists(home_jit_dir): + shutil.copytree(this_dir, home_jit_dir) + return home_jit_dir + + +bd_dir = f"{get_user_jit_dir()}/build" +# copy ck to build, thus hippify under bd_dir +if multiprocessing.current_process().name == "MainProcess": + os.makedirs(bd_dir, exist_ok=True) + # if os.path.exists(f"{bd_dir}/ck/library"): + # shutil.rmtree(f"{bd_dir}/ck/library") +# CK_DIR = f"{bd_dir}/ck" + + +def validate_and_update_archs(): + archs = 
os.getenv("GPU_ARCHS", "native").split(";") + archs = [arch.strip() for arch in archs] + # List of allowed architectures + allowed_archs = [ + "native", + "gfx90a", + "gfx940", + "gfx941", + "gfx942", + "gfx1100", + "gfx950", + "gfx928", + "gfx936", + "gfx938", + "gfx946", + ] + + # Validate if each element in archs is in allowed_archs + assert all( + arch in allowed_archs for arch in archs + ), f"One of GPU archs of {archs} is invalid or not supported" + return archs + + +@functools.lru_cache() +def hip_flag_checker(flag_hip: str) -> bool: + hipcc = executable_path("hipcc") + ret = os.system(f'"{hipcc}" {flag_hip} -x hip -E -P /dev/null -o /dev/null') + if ret == 0: + return True + else: + logger.warning(f"{flag_hip} is not supported by hipcc.") + return False + +def _path_under_prefix(path: str, prefix: str) -> bool: + if not path: + return False + rp = os.path.realpath(path) + rprefix = os.path.realpath(prefix) + try: + common = os.path.commonpath([rp, rprefix]) + except ValueError: + return False + return common == rprefix + +@functools.lru_cache(maxsize=1) +def detect_dtk_env() -> bool: + # Simplified detection logic: + # 1) If 'aicc' is present (in PATH or at /opt/dtk/bin/aicc), treat it as hipcc alias and use it for compilation. + # 2) Otherwise fall back to the normal hipcc resolution (executable_path("hipcc")). + # DTK environment is determined when the selected hipcc (or ROCM_PATH) is under /opt/dtk. + + # Try to locate 'aicc' first (DTK's renamed hipcc) + aicc_path = shutil.which("aicc") + if not aicc_path: + candidate = "/opt/dtk/bin/aicc" + if os.path.exists(candidate): + aicc_path = os.path.realpath(candidate) + + hipcc = "" + hipcc_in_dtk = False + + if aicc_path: + # Use aicc as the hipcc implementation by exporting HIPCC so other code that calls executable_path("hipcc") + # will pick up the aicc binary. 
+ hipcc = os.path.realpath(aicc_path) + os.environ["HIPCC"] = hipcc + hipcc_in_dtk = _path_under_prefix(hipcc, "/opt/dtk") + logger.info(f"Found 'aicc' and using it as hipcc: {hipcc}") + else: + # Fallback to normal hipcc resolution (may raise/abort in executable_path) + try: + hipcc = executable_path("hipcc") + except Exception: + # If executable_path fails, try a best-effort lookup via shutil.which + hipcc = shutil.which("hipcc") or "" + if hipcc: + hipcc = os.path.realpath(hipcc) + hipcc_in_dtk = _path_under_prefix(hipcc, "/opt/dtk") if hipcc else False + + # Also consider ROCM_PATH pointing under /opt/dtk + rocm_path = os.getenv("ROCM_PATH", "") + rocm_in_dtk = _path_under_prefix(rocm_path, "/opt/dtk") + + enabled = hipcc_in_dtk or rocm_in_dtk + if enabled: + logger.info( + f"DTK environment detected (hipcc={hipcc}, ROCM_PATH={rocm_path}), enabling -DDTK_ENV" + ) + else: + logger.info( + f"Non-DTK environment (hipcc={hipcc}, ROCM_PATH={rocm_path}), DTK_ENV disabled" + ) + return enabled + +def check_and_set_ninja_worker(): + max_num_jobs_cores = max(1, os.cpu_count() * 0.8) + import psutil + + # calculate the maximum allowed NUM_JOBS based on free memory + free_memory_gb = psutil.virtual_memory().available / (1024**3) # free memory in GB + max_num_jobs_memory = int(free_memory_gb / 0.5) # assuming 0.5 GB per job + + # pick lower value of jobs based on cores vs memory metric to minimize oom and swap usage during compilation + max_jobs = int(max(1, min(max_num_jobs_cores, max_num_jobs_memory))) + max_jobs_env = os.environ.get("MAX_JOBS") + if max_jobs_env is not None: + try: + max_processes = int(max_jobs_env) + # too large value + if max_processes > max_jobs: + os.environ["MAX_JOBS"] = str(max_jobs) + # error value + except ValueError: + os.environ["MAX_JOBS"] = str(max_jobs) + # none value + else: + os.environ["MAX_JOBS"] = str(max_jobs) + + +def rename_cpp_to_cu(els, dst, hipify, recursive=False): + def do_rename_and_mv(name, src, dst, ret): + newName = name + 
if hipify: + if name.endswith(".cpp") or name.endswith(".cu"): + newName = name.replace(".cpp", ".cu") + ret.append(f"{dst}/{newName}") + shutil.copy(f"{src}/{name}", f"{dst}/{newName}") + else: + if name.endswith(".cpp") or name.endswith(".cu"): + ret.append(f"{src}/{newName}") + + ret = [] + for el in els: + if not os.path.exists(el): + logger.warning(f"---> {el} not exists!!!!!!") + continue + if os.path.isdir(el): + for entry in os.listdir(el): + if os.path.isdir(f"{el}/{entry}"): + if recursive: + ret += rename_cpp_to_cu( + [f"{el}/{entry}"], dst, hipify, recursive + ) + continue + do_rename_and_mv(entry, el, dst, ret) + else: + do_rename_and_mv(os.path.basename(el), os.path.dirname(el), dst, ret) + return ret + + +@torch_compile_guard() +def check_numa_custom_op() -> None: + numa_balance_set = os.popen("cat /proc/sys/kernel/numa_balancing").read().strip() + if numa_balance_set == "1": + logger.warning( + "WARNING: NUMA balancing is enabled, which may cause errors. " + "It is recommended to disable NUMA balancing by running \"sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'\" " + ) + + +@functools.lru_cache() +def check_numa(): + check_numa_custom_op() + + +__mds = {} + + +@torch_compile_guard() +def get_module_custom_op(md_name: str) -> None: + global __mds + if md_name not in __mds: + if "AITER_JIT_DIR" in os.environ: + __mds[md_name] = importlib.import_module(md_name) + else: + __mds[md_name] = importlib.import_module(f"{__package__}.{md_name}") + + if AITER_LOG_MORE: + logger.info(f"import [{md_name}] under {__mds[md_name].__file__}") + return + + +@functools.lru_cache(maxsize=1024) +def get_module(md_name): + check_numa() + get_module_custom_op(md_name) + return __mds[md_name] + + +rebuilded_list = ["module_aiter_enum"] + + +def rm_module(md_name): + os.system(f"rm -rf {get_user_jit_dir()}/{md_name}.so") + + +def clear_build(md_name): + os.system(f"rm -rf {bd_dir}/{md_name}") + + +def build_module( + md_name, + srcs, + flags_extra_cc, + 
flags_extra_hip, + blob_gen_cmd, + extra_include, + extra_ldflags, + verbose, + is_python_module, + is_standalone, + torch_exclude, + hipify=False, +): + lock_path = f"{bd_dir}/lock_{md_name}" + startTS = time.perf_counter() + target_name = f"{md_name}.so" if not is_standalone else md_name + + def MainFunc(): + if AITER_REBUILD == 1: + rm_module(md_name) + clear_build(md_name) + elif AITER_REBUILD >= 2: + rm_module(md_name) + op_dir = f"{bd_dir}/{md_name}" + logger.info(f"start build [{md_name}] under {op_dir}") + + opbd_dir = f"{op_dir}/build" + src_dir = f"{op_dir}/build/srcs" + os.makedirs(src_dir, exist_ok=True) + + if os.path.exists(f"{get_user_jit_dir()}/{target_name}"): + os.remove(f"{get_user_jit_dir()}/{target_name}") + + sources = rename_cpp_to_cu(srcs, src_dir, hipify) + + flags_cc = ["-O3", "-std=c++20"] + flags_hip = [ + # "-DLEGACY_HIPBLAS_DIRECT", + "-DUSE_PROF_API=1", + "-D__HIP_PLATFORM_HCC__=1", + "-D__HIP_PLATFORM_AMD__=1", + "-U__HIP_NO_HALF_CONVERSIONS__", + "-U__HIP_NO_HALF_OPERATORS__", + "-mllvm --amdgpu-kernarg-preload-count=16", + # "-v --save-temps", + "-Wno-unused-result", + "-Wno-switch-bool", + "-Wno-vla-cxx-extension", + "-Wno-undefined-func-template", + "-Wno-macro-redefined", + "-Wno-missing-template-arg-list-after-template-kw", + "-fgpu-flush-denormals-to-zero", + ] + + # Imitate https://github.com/ROCm/composable_kernel/blob/c8b6b64240e840a7decf76dfaa13c37da5294c4a/CMakeLists.txt#L190-L214 + hip_version = parse(get_hip_version().split()[-1].rstrip("-").replace("-", "+")) + if hip_version > Version("5.5.00000"): + flags_hip += ["-mllvm --lsr-drop-solution=1"] + if hip_version > Version("5.7.23302"): + flags_hip += ["-fno-offload-uniform-block"] + if hip_version > Version("6.1.40090"): + flags_hip += ["-mllvm -enable-post-misched=0"] + if hip_version > Version("6.2.41132"): + flags_hip += [ + "-mllvm -amdgpu-early-inline-all=true", + "-mllvm -amdgpu-function-calls=false", + ] + if hip_version > Version("6.2.41133"): + flags_hip += 
["-mllvm -amdgpu-coerce-illegal-types=1"] + if get_gfx() == "gfx946" and int(os.getenv("AITER_FP4x2", "1")) > 0: + flags_hip += ["-D__Float4_e2m1fn_x2"] + + if not torch_exclude: + import torch + + if hasattr(torch, "float4_e2m1fn_x2"): + flags_hip += ["-DTORCH_Float4_e2m1fn_x2"] + + # Enable DTK code path only when hipcc/ROCM_PATH indicates /opt/dtk + if detect_dtk_env(): + flags_cc.append("-DDTK_ENV") + flags_hip.append("-DDTK_ENV") + + flags_cc += flags_extra_cc + flags_hip += flags_extra_hip + archs = validate_and_update_archs() + flags_hip += [f"--offload-arch={arch}" for arch in archs] + if any(arch == "gfx938" for arch in archs) or get_gfx()=="gfx938": + flags_hip.append("-DGPU_ENABLE_FP8") # device + flags_cc.append("-DGPU_ENABLE_FP8") # host + + flags_hip = sorted(set(flags_hip)) # remove same flags + flags_hip = [el for el in flags_hip if hip_flag_checker(el)] + check_and_set_ninja_worker() + + def exec_blob(blob_gen_cmd, op_dir, src_dir, sources): + if blob_gen_cmd: + blob_dir = f"{op_dir}/blob" + os.makedirs(blob_dir, exist_ok=True) + if AITER_LOG_MORE: + logger.info(f"exec_blob ---> {PY} {blob_gen_cmd.format(blob_dir)}") + os.system(f"{PY} {blob_gen_cmd.format(blob_dir)}") + sources += rename_cpp_to_cu([blob_dir], src_dir, hipify, recursive=True) + return sources + + if isinstance(blob_gen_cmd, list): + for s_blob_gen_cmd in blob_gen_cmd: + sources = exec_blob(s_blob_gen_cmd, op_dir, src_dir, sources) + else: + sources = exec_blob(blob_gen_cmd, op_dir, src_dir, sources) + + extra_include_paths = [ + f"{CK_DIR}/include", + f"{CK_DIR}/library/include", + ] + if not hipify: + extra_include_paths += [ + f"{AITER_CSRC_DIR}/include", + f"{op_dir}/blob", + ] + extra_include + if not is_standalone: + extra_include_paths += [f"{AITER_CSRC_DIR}/include/torch"] + else: + old_bd_include_dir = f"{op_dir}/build/include" + extra_include_paths.append(old_bd_include_dir) + os.makedirs(old_bd_include_dir, exist_ok=True) + rename_cpp_to_cu( + 
[f"{AITER_CSRC_DIR}/include"] + extra_include, + old_bd_include_dir, + hipify, + ) + + if not is_standalone: + bd_include_dir = f"{op_dir}/build/include/torch" + os.makedirs(bd_include_dir, exist_ok=True) + rename_cpp_to_cu( + [f"{AITER_CSRC_DIR}/include/torch"], + bd_include_dir, + hipify, + ) + + try: + _jit_compile( + md_name, + sorted(set(sources)), + extra_cflags=flags_cc, + extra_cuda_cflags=flags_hip, + extra_ldflags=extra_ldflags, + extra_include_paths=extra_include_paths, + build_directory=opbd_dir, + verbose=verbose or AITER_LOG_MORE > 1, + with_cuda=True, + is_python_module=is_python_module, + is_standalone=is_standalone, + torch_exclude=torch_exclude, + hipify=hipify, + ) + if is_python_module and not is_standalone: + shutil.copy(f"{opbd_dir}/{target_name}", f"{get_user_jit_dir()}") + else: + shutil.copy( + f"{opbd_dir}/{target_name}", f"{AITER_ROOT_DIR}/op_tests/cpp/mha" + ) + except Exception as e: + tag = f"\033[31mfailed jit build [{md_name}]\033[0m" + logger.error( + f"{tag}\u2193\u2193\u2193\u2193\u2193\u2193\u2193\u2193\u2193\u2193\n-->[History]: {{}}{tag}\u2191\u2191\u2191\u2191\u2191\u2191\u2191\u2191\u2191\u2191".format( + re.sub( + "error:", + "\033[31merror:\033[0m", + "-->".join(traceback.format_exception(*sys.exc_info())), + flags=re.I, + ), + ) + ) + raise SystemExit( + f"[aiter] build [{md_name}] under {opbd_dir} failed !!!!!!" 
+ ) from e + + def FinalFunc(): + logger.info( + f"\033[32mfinish build [{md_name}], cost {time.perf_counter()-startTS:.1f}s \033[0m" + ) + + mp_lock(lockPath=lock_path, MainFunc=MainFunc, FinalFunc=FinalFunc) + + +def get_args_of_build(ops_name: str, exclude=[]): + d_opt_build_args = { + "srcs": [], + "md_name": "", + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": None, + "extra_include": [], + "verbose": False, + "hipify": False, + "is_python_module": True, + "is_standalone": False, + "torch_exclude": False, + "hip_clang_path": None, + "blob_gen_cmd": "", + "skip_if": False, + } + + def convert(d_ops: dict): + converted_ops = {} + for k, val in d_ops.items(): + if isinstance(val, list): + converted_list = list(val) + for idx, el in enumerate(val): + if isinstance(el, str): + if "torch" in el: + import torch as torch + converted_list[idx] = eval(el) + else: + converted_list[idx] = el + converted_ops[k] = converted_list + elif isinstance(val, str): + converted_ops[k] = eval(val) + else: + converted_ops[k] = val + + # undefined compile features will be replaced with default value + resolved_build_args = copy.deepcopy(d_opt_build_args) + resolved_build_args.update(converted_ops) + return resolved_build_args + + with open(this_dir + "/optCompilerConfig.json", "r") as file: + data = json.load(file) + if isinstance(data, dict): + # parse all ops, return list + if ops_name == "all": + all_ops_list = [] + d_all_ops = { + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_include": [], + "extra_ldflags": [], + "blob_gen_cmd": [], + } + # traverse opts + for ops_name, d_ops in data.items(): + # Cannot contain tune ops + if ops_name.endswith("tune"): + continue + # exclude + if ops_name in exclude: + continue + single_ops = convert(d_ops) + d_single_ops = { + "md_name": ops_name, + "srcs": single_ops["srcs"], + "flags_extra_cc": single_ops["flags_extra_cc"], + "flags_extra_hip": single_ops["flags_extra_hip"], + "extra_include": 
single_ops["extra_include"], + "extra_ldflags": single_ops["extra_ldflags"], + "blob_gen_cmd": single_ops["blob_gen_cmd"], + "verbose": single_ops["verbose"], + "hipify": single_ops["hipify"], + "skip_if": single_ops.get("skip_if", False), + } + for k in d_all_ops.keys(): + if isinstance(single_ops[k], list): + d_all_ops[k] += single_ops[k] + elif isinstance(single_ops[k], str) and single_ops[k] != "": + d_all_ops[k].append(single_ops[k]) + all_ops_list.append(d_single_ops) + + return all_ops_list, d_all_ops + # no find opt_name in json. + elif data.get(ops_name) is None: + logger.warning( + "Not found this operator (" + + ops_name + + ") in 'optCompilerConfig.json'. " + ) + return d_opt_build_args + # parser single opt + else: + compile_ops_ = data.get(ops_name) + return convert(compile_ops_) + else: + logger.warning( + "ERROR: pls use dict_format to write 'optCompilerConfig.json'! " + ) + +def compile_ops( + _md_name: str, + fc_name: Optional[str] = None, + gen_func: Optional[Callable[..., dict[str, Any]]] = None, + gen_fake: Optional[Callable[..., Any]] = None, +): + def decorator(func): + func.arg_checked = False + + @functools.wraps(func) + def wrapper(*args, custom_build_args={}, **kwargs): + loadName = fc_name + md_name = _md_name + if fc_name is None: + loadName = func.__name__ + try: + module = None + if gen_func is not None: + custom_build_args.update(gen_func(*args, **kwargs)) + elif AITER_REBUILD and md_name not in rebuilded_list: + rebuilded_list.append(md_name) + raise ModuleNotFoundError("start rebuild") + if module is None: + try: + module = get_module(md_name) + except Exception as e: + md = custom_build_args.get("md_name", md_name) + module = get_module(md) + except ModuleNotFoundError: + d_args = get_args_of_build(md_name) + d_args.update(custom_build_args) + + if d_args.get("skip_if", False): + logger.info(f"skip build [{md_name}] due to skip_if condition") + return None + + # update module if we have coustom build + md_name = 
custom_build_args.get("md_name", md_name) + + srcs = d_args["srcs"] + flags_extra_cc = d_args["flags_extra_cc"] + flags_extra_hip = d_args["flags_extra_hip"] + blob_gen_cmd = d_args["blob_gen_cmd"] + extra_include = d_args["extra_include"] + extra_ldflags = d_args["extra_ldflags"] + verbose = d_args["verbose"] + is_python_module = d_args["is_python_module"] + is_standalone = d_args["is_standalone"] + torch_exclude = d_args["torch_exclude"] + hipify = d_args.get("hipify", False) + hip_clang_path = d_args.get("hip_clang_path", None) + prev_hip_clang_path = None + if hip_clang_path is not None and os.path.exists(hip_clang_path): + prev_hip_clang_path = os.environ.get("HIP_CLANG_PATH", None) + os.environ["HIP_CLANG_PATH"] = hip_clang_path + build_module( + md_name, + srcs, + flags_extra_cc, + flags_extra_hip, + blob_gen_cmd, + extra_include, + extra_ldflags, + verbose, + is_python_module, + is_standalone, + torch_exclude, + hipify, + ) + + if hip_clang_path is not None: + if prev_hip_clang_path is not None: + os.environ["HIP_CLANG_PATH"] = prev_hip_clang_path + else: + os.environ.pop("HIP_CLANG_PATH", None) + + if is_python_module: + module = get_module(md_name) + if md_name not in __mds: + __mds[md_name] = module + + if isinstance(module, types.ModuleType): + op = getattr(module, loadName) + else: + return None + + def check_args(): + get_asm_dir() + import inspect + import re + + import torch + + enum_types = ["ActivationType", "QuantType"] + + if not op.__doc__.startswith("Members:"): + doc_str = op.__doc__.split("\n")[0] + doc_str = re.sub(r"<(.*?)\:.*?>", r"\g<1>", doc_str) + doc_str = doc_str.replace("list[", "List[") + doc_str = doc_str.replace("tuple[", "Tuple[") + doc_str = doc_str.replace("collections.abc.Sequence[", "List[") + doc_str = doc_str.replace("typing.SupportsInt", "int") + doc_str = doc_str.replace("typing.SupportsFloat", "float") + # A|None --> Optional[A] + pattern = r"([\w\.]+(?:\[[^\]]+\])?)\s*\|\s*None" + doc_str = re.sub(pattern, 
r"Optional[\1]", doc_str) + for el in enum_types: + doc_str = re.sub(f" aiter.*{el} ", f" {el} ", doc_str) + namespace = { + "List": List, + "Optional": Optional, + "torch": torch, + "typing": typing, + } + + exec( + f"from aiter import*\ndef {doc_str}: pass", + namespace, + ) + foo = namespace[doc_str.split("(")[0]] + sig = inspect.signature(foo) + func.__signature__ = sig + ann = {k: v.annotation for k, v in sig.parameters.items()} + ann["return"] = sig.return_annotation + callargs = inspect.getcallargs(func, *args, **kwargs) + for el, arg in callargs.items(): + expected_type = ann[el] + got_type = type(arg) + origin = typing.get_origin(expected_type) + sub_t = typing.get_args(expected_type) + + if origin is None: + if not isinstance(arg, expected_type) and not ( + # aiter_enum can be int + any(el in str(expected_type) for el in enum_types) + and isinstance(arg, int) + ): + raise TypeError( + f"{loadName}: {el} needs to be {expected_type} but got {got_type}" + ) + elif origin is list: + if ( + not isinstance(arg, list) + # or not all(isinstance(i, sub_t) for i in arg) + ): + raise TypeError( + f"{loadName}: {el} needs to be List[{sub_t}] but got {arg}" + ) + elif origin is typing.Union or origin is types.UnionType: + if arg is not None and not isinstance(arg, sub_t): + raise TypeError( + f"{loadName}: {el} needs to be Optional[{sub_t}] but got {arg}" + ) + else: + raise TypeError(f"Unsupported type: {expected_type}") + + func_hints = typing.get_type_hints(func) + if ann["return"] is None: + func_hints["return"] = None + # if ann != func_hints: + # logger.warning( + # f"type hints mismatch, override to --> {doc_str}" + # ) + return True + + if not func.arg_checked: + func.arg_checked = check_args() + + if AITER_LOG_MORE == 2: + from ..test_common import log_args + + log_args(func, *args, **kwargs) + + return op(*args, **kwargs) + + @torch_compile_guard(device="cuda", gen_fake=gen_fake, calling_func_=func) + def custom_wrapper(*args, **kwargs): + return 
wrapper(*args, **kwargs) + + return custom_wrapper + + return decorator diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json new file mode 100644 index 0000000000000000000000000000000000000000..cecfcacb7d8bc6fb2ed8c977e9faa140536a179e --- /dev/null +++ b/aiter/jit/optCompilerConfig.json @@ -0,0 +1,390 @@ +{ + "module_aiter_enum": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/aiter_enum_pybind.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "torch_exclude": "False", + "blob_gen_cmd": "''" + }, + "module_activation": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/activation_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/activation_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": ["'-DENABLE_FP8'"], + "extra_ldflags": "None", + "extra_include": ["f'{AITER_CSRC_DIR}/include/ck_tile'"], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_custom_all_reduce": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/custom_all_reduce_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/custom_all_reduce.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_moe_sorting": { + "srcs": [ + "f'{AITER_CSRC_DIR}/py_itfs_ck/moe_sorting_kernels.cu'", + "f'{AITER_CSRC_DIR}/pybind/moe_sorting_pybind.cu'", + "f'{CK_DIR}/example_hcu/ck_tile/13_moe_sorting/'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{CK_DIR}/example_hcu/ck_tile/13_moe_sorting/'" + ], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_moe_sum": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/moe_sum_pybind.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_moe": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/moe_ck_pybind.cu'", + 
"f'{CK_DIR}/example_hcu/ck_tile/17_fused_moe/instances'", + "f'{CK_DIR}/example_hcu/ck_tile/17_fused_moe/moe_2stage'", + "f'{CK_DIR}/example_hcu/ck_tile/18_moe_quant/instances'", + "f'{AITER_CSRC_DIR}/py_itfs_ck/moe_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{CK_DIR}/example_hcu/ck_tile/17_fused_moe'", + "f'{CK_DIR}/example_hcu/ck_tile/18_moe_quant'" + ], + "verbose": "False", + "hipify": "True", + "blob_gen_cmd": "''" + }, + "module_moe_utils":{ + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/moe_utils_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/topk_softmax_kernels.cu'", + "f'{AITER_CSRC_DIR}/kernels/topk_softmax_kernels_group.cu'", + "f'{AITER_CSRC_DIR}/kernels/moe_fused_gate.cu'", + "f'{AITER_CSRC_DIR}/kernels/moe_align_sum_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": ["'-DENABLE_FP8'"], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/include/ck_tile'" + ], + "verbose": "False", + "hifify": "True", + "blob_gen_cmd": "''" + }, + "module_moe_asm": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/moe_asm_2stages_pybind.cu'", + "f'{AITER_CSRC_DIR}/py_itfs_asm/asm_fmoe_2stage.cpp'", + "f'{AITER_CSRC_DIR}/py_itfs_asm/asm_fmoe_a8.cpp'", + "f'{AITER_CSRC_DIR}/py_itfs_asm/asm_fmoe_solutions.cpp'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "hipify":"True", + "blob_gen_cmd": "''" + }, + "module_awq_gemm_asm": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/awq_gemm_asm_pybind.cu'", + "f'{AITER_CSRC_DIR}/py_itfs_asm/asm_gemm_awq.cpp'", + "f'{AITER_CSRC_DIR}/py_itfs_asm/asm_gemm_kernel_config.cpp'" + ], + "flags_extra_cc": [ + "f'-DAITER_OPT_KERNEL_CONFIG_PATH=\"{AITER_CSRC_DIR}/py_itfs_asm/optKernelManifest.json\"'" + ], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/py_itfs_asm'" + ], + "verbose": "False", + "hipify": "True", + "blob_gen_cmd": 
"''" + }, + "module_awq_dq_asm": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/awq_dq_asm_pybind.cu'", + "f'{AITER_CSRC_DIR}/py_itfs_asm/asm_dq_awq.cpp'" + ], + "flags_extra_cc": [ + "f'-DAITER_OPT_KERNEL_CONFIG_PATH=\"{AITER_CSRC_DIR}/py_itfs_asm/optKernelManifest.json\"'" + ], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/py_itfs_asm'" + ], + "verbose": "False", + "hipify": "True", + "blob_gen_cmd": "''" + }, + "module_norm": { + "srcs": [ + "f'{AITER_CSRC_DIR}/py_itfs_ck/norm_kernels.cu'", + "f'{AITER_CSRC_DIR}/pybind/norm_pybind.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{CK_DIR}/example_hcu/ck_tile/02_layernorm2d'" + ], + "verbose": "False", + "blob_gen_cmd": "f'{CK_DIR}/example_hcu/ck_tile/02_layernorm2d/generate.py --api fwd --gen_blobs --working_path {{}}'" + }, + "module_pos_encoding": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/pos_encoding_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/pos_encoding_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_rmsnorm": { + "srcs": [ + "f'{AITER_CSRC_DIR}/kernels/rmsnorm_kernels.cu'", + "f'{AITER_CSRC_DIR}/py_itfs_ck/rmsnorm_ck_kernels.cu'", + "f'{AITER_CSRC_DIR}/pybind/rmsnorm_pybind.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [ + "f'{CK_DIR}/example_hcu/ck_tile/10_rmsnorm2d'" + ], + "verbose": "False", + "blob_gen_cmd": "f'{CK_DIR}/example_hcu/ck_tile/10_rmsnorm2d/generate.py --api fwd --gen_blobs --working_path {{}}'" + }, + "module_aiter_operator": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/aiter_operator_pybind.cu'", + "f'{AITER_CSRC_DIR}/include/binary_operator.cuh'", + "f'{AITER_CSRC_DIR}/kernels/binary_operator.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": 
[], + "verbose": "False", + "blob_gen_cmd": "f'{AITER_CSRC_DIR}/kernels/generate_binaryop.py --working_path {{}} --optype all --dtypes all'" + }, + "module_aiter_unary": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/aiter_unary_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/unary_operator.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_quant": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/quant_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/quant_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [ + "'-DENABLE_FP8'" + ], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/include/ck_tile'" + ], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_rope_general_fwd": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/rope_general_fwd_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/rope/rope_common.h'", + "f'{AITER_CSRC_DIR}/kernels/rope/general_fwd_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_rope_general_bwd": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/rope_general_bwd_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/rope/rope_common.h'", + "f'{AITER_CSRC_DIR}/kernels/rope/general_bwd_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_rope_pos_fwd": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/rope_pos_fwd_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/rope/rope_common.h'", + "f'{AITER_CSRC_DIR}/kernels/rope/pos_fwd_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_fused_qk_norm_mrope_cache_quant_shuffle": { + "srcs": [ + 
"f'{AITER_CSRC_DIR}/pybind/fused_qk_norm_mrope_cache_quant_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/rope/rope_common.h'", + "f'{AITER_CSRC_DIR}/kernels/fused_qk_norm_mrope_cache_quant.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_fused_qk_norm_rope_cache_quant_shuffle": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/fused_qk_norm_rope_cache_quant_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/rope/rope_common.h'", + "f'{AITER_CSRC_DIR}/kernels/fused_qk_norm_rope_cache_quant.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [ + "'-DENABLE_FP8'" + ], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/include/ck_tile'", + "f'{AITER_CSRC_DIR}/include/opus'" + ], + "verbose": "False", + "blob_gen_cmd": "''" + }, + "module_rocsolgemm": { + "srcs": [ + "f'{AITER_GRADLIB_DIR}/csrc/rocsolgemm.cu'" + ], + "flags_extra_cc": [ + "'-O3'" + ], + "flags_extra_hip": [ + "'-O3'", + "'-U__CUDA_NO_HALF_OPERATORS__'", + "'-U__CUDA_NO_HALF_CONVERSIONS__'", + "'-ftemplate-depth=1024'" + ], + "extra_ldflags": ["'-lrocblas'"], + "extra_include": [ + "f'{AITER_GRADLIB_DIR}/include/'" + ], + "hipify": "True", + "verbose": "False", + "blob_gen_cmd": "''", + "skip_if": "detect_dtk_env()" + }, + "module_hipbsolgemm": { + "srcs": [ + "f'{AITER_GRADLIB_DIR}/csrc/hipbsolgemm.cu'" + ], + "flags_extra_cc": [ + "'-O3'" + ], + "flags_extra_hip": [ + "'-O3'", + "'-U__CUDA_NO_HALF_OPERATORS__'", + "'-U__CUDA_NO_HALF_CONVERSIONS__'", + "'-ftemplate-depth=1024'", + "'-DENABLE_TORCH_FP8' if hasattr(torch, 'float8_e4m3fn') else '' " + ], + "extra_ldflags": ["'-lhipblaslt'"], + "extra_include": [ + "f'{AITER_GRADLIB_DIR}/include/'" + ], + "hipify": "True", + "verbose": "False", + "blob_gen_cmd": "''", + "skip_if": "detect_dtk_env()" + }, + "module_moe_c_kernel": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/moe_c_pybind.cu'", + "f'{MOE_C_DIR}/csrc_for_aiter'", + 
"f'{AITER_CSRC_DIR}/py_itfs_moe_c/moe_c.cu'" + ], + "flags_extra_cc": ["' -mllvm -support-768-vgprs=true -mllvm -disable-machine-sink '" + ], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "hipify": "True", + "blob_gen_cmd": "''" + }, + "module_topk_plain": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/topk_plain_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/topk_plain_kernels.cu'", + "f'{AITER_CSRC_DIR}/kernels/topk_per_row_kernels.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "hipify": "True", + "blob_gen_cmd": "''" + }, + "module_topk_transform": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/topk_transform_pybind.cu'", + "f'{AITER_CSRC_DIR}/kernels/topk_transform.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "hipify": "True", + "blob_gen_cmd": "''" + } +} diff --git a/aiter/jit/utils/__init__.py b/aiter/jit/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0bde01b04e250caa825555c8a1926b3fbcb23ad --- /dev/null +++ b/aiter/jit/utils/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT + \ No newline at end of file diff --git a/aiter/jit/utils/_cpp_extension_versioner.py b/aiter/jit/utils/_cpp_extension_versioner.py new file mode 100644 index 0000000000000000000000000000000000000000..35730ce03f517ccd9cea8d248de1d4022a6d2aa7 --- /dev/null +++ b/aiter/jit/utils/_cpp_extension_versioner.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: MIT + # mypy: allow-untyped-defs +import collections + + +Entry = collections.namedtuple("Entry", "version, hash") + + +def update_hash(seed, value): + # Good old boost::hash_combine + # https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + return seed ^ (hash(value) + 0x9E3779B9 + (seed << 6) + (seed >> 2)) + + +def hash_source_files(hash_value, 
@functools.lru_cache(maxsize=1)
def get_gfx():
    """Return the GPU architecture string to build for.

    The ``GPU_ARCHS`` environment variable wins when it is set to anything
    other than ``"native"``.  Otherwise ``rocminfo`` is executed and the text
    after the last ``:`` on the first output line containing ``gfx`` is
    returned.  If no such line exists the literal ``"native"`` is returned.

    The result is memoised, so the environment is only consulted on the first
    call (use ``get_gfx.cache_clear()`` to re-evaluate).

    Raises:
        RuntimeError: if ``rocminfo`` cannot be executed.  The original
            exception is attached as ``__cause__``.
    """
    gfx = os.getenv("GPU_ARCHS", "native")
    if gfx != "native":
        return gfx

    # Only the process launch can realistically fail, so keep the try narrow.
    try:
        result = subprocess.run(
            ["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
    except Exception as e:
        raise RuntimeError(f"Get GPU arch from rocminfo failed {str(e)}") from e

    for line in result.stdout.split("\n"):
        if "gfx" in line.lower():
            return line.split(":")[-1].strip()
    # rocminfo ran but reported no gfx target: keep the historical fallback.
    return gfx


@functools.lru_cache(maxsize=1)
def get_cu_num():
    """Return ``multi_processor_count`` of the current torch CUDA/HIP device.

    torch is imported lazily so that importing this module does not require it.
    The value is memoised after the first call.
    """
    import torch

    device = torch.cuda.current_device()
    cu_num = torch.cuda.get_device_properties(device).multi_processor_count
    return cu_num
def executable_path(executable: str) -> str:
    """
    Return the resolved path to ``executable``.

    Candidates are tried in this order and the first one that exists wins:
    the environment variable named after the upper-cased executable, the
    result of ``shutil.which``, then ``bin/``, ``hip/bin/`` and ``llvm/bin/``
    under the ROCm home reported by ``_find_rocm_home()``.

    Args:
        executable (str): The name of the executable.

    Returns:
        The ``os.path.realpath`` of the first existing candidate.

    Raises:
        AssertionError: if no candidate exists.  It is raised explicitly
            (rather than via ``assert``) so the check survives ``python -O``.
    """
    env_override = os.environ.get(executable.upper())
    candidate_paths = [env_override, shutil.which(executable)]

    home = _find_rocm_home()
    if home:
        candidate_paths.extend(
            [
                os.path.join(home, "bin", executable),
                os.path.join(home, "hip", "bin", executable),
                os.path.join(home, "llvm", "bin", executable),
            ]
        )

    for candidate in candidate_paths:
        if candidate and os.path.exists(candidate):
            # realpath is idempotent, so resolving once is enough.
            return os.path.realpath(candidate)

    raise AssertionError(
        f"Could not find {executable} in PATH or ROCM_HOME({home})"
    )


def get_hip_version():
    """Return the raw text printed by ``hipconfig --version``.

    Raises:
        RuntimeError: if ``hipconfig`` cannot be located or executed.  The
            underlying exception is kept as ``__cause__`` for diagnosis.
    """
    try:
        hipconfig = executable_path("hipconfig")
        output = subprocess.check_output([hipconfig, "--version"], text=True)
        return output
    except Exception as err:
        raise RuntimeError("ROCm version file not found") from err


def _find_rocm_home() -> Optional[str]:
    """Find the ROCm install path.

    Tries ``$ROCM_HOME`` / ``$ROCM_PATH`` first, then the directory two levels
    above the resolved ``hipcc`` found on PATH, then ``/opt/rocm`` if it
    exists.  Returns ``None`` (after printing a notice to stderr) when every
    guess fails.
    """
    # Guess #1
    rocm_home = os.environ.get("ROCM_HOME") or os.environ.get("ROCM_PATH")
    if rocm_home is None:
        # Guess #2
        hipcc_path = shutil.which("hipcc")
        if hipcc_path is not None:
            rocm_home = os.path.dirname(os.path.dirname(os.path.realpath(hipcc_path)))
            # can be either <ROCM_HOME>/hip/bin/hipcc or <ROCM_HOME>/bin/hipcc
            if os.path.basename(rocm_home) == "hip":
                rocm_home = os.path.dirname(rocm_home)
        else:
            # Guess #3
            fallback_path = "/opt/rocm"
            if os.path.exists(fallback_path):
                rocm_home = fallback_path
    if rocm_home is None:
        print(
            f"No ROCm runtime is found, using ROCM_HOME='{rocm_home}'", file=sys.stderr
        )
    return rocm_home


def _join_rocm_home(*paths) -> str:
    """
    Join paths with ROCM_HOME, or raise an error if ROCM_HOME is not set.

    This is basically a lazy way of raising an error for missing $ROCM_HOME
    only once we need to get any ROCm-specific path.

    Raises:
        OSError: if ``ROCM_HOME`` is ``None`` or the platform is Windows.
    """
    if ROCM_HOME is None:
        raise OSError(
            "ROCM_HOME environment variable is not set. "
            "Please set it to your ROCm install root."
        )
    elif IS_WINDOWS:
        raise OSError(
            "Building PyTorch extensions using ROCm and Windows is not supported."
        )
    return os.path.join(ROCM_HOME, *paths)
def get_cxx_compiler():
    """Return the C++ compiler command: ``$CXX`` if set, otherwise ``"c++"``."""
    return os.getenv("CXX", "c++")


def _is_binary_build() -> bool:
    """Tell whether the installed torch looks like a release (binary) build.

    A version string matching ``BUILT_FROM_SOURCE_VERSION_PATTERN`` marks a
    from-source build; anything else is treated as a binary build.
    """
    import torch

    source_match = BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__)
    return source_match is None
def _maybe_write(filename, new_content):
    r"""
    Equivalent to writing the content into the file but will not touch the file
    if it already had the right content (to avoid triggering recompile).
    """
    if os.path.exists(filename):
        with open(filename) as f:
            content = f.read()

        if content == new_content:
            # The file already contains the right thing!
            return

    with open(filename, "w") as source_file:
        source_file.write(new_content)


def check_compiler_ok_for_platform(compiler: str) -> bool:
    """
    Verify that the compiler is one of the accepted ones.

    Args:
        compiler (str): The compiler executable to check.

    Returns:
        ``False`` if the compiler cannot be found on PATH.  ``True`` if its
        resolved path contains one of the names returned by
        ``_accepted_compilers_for_platform()``.  Otherwise the compiler is run
        with ``-v`` and, on Linux, its output is inspected to identify a
        wrapped gcc/clang; on any other platform the result is ``False``.
    """
    # shutil.which returns None for a missing executable; os.path.realpath
    # would raise TypeError on None, so test before resolving symlinks.
    compiler_path = shutil.which(compiler)
    if compiler_path is None:
        return False
    compiler_path = os.path.realpath(compiler_path)

    # Check the compiler name
    if any(name in compiler_path for name in _accepted_compilers_for_platform()):
        return True
    # If compiler wrapper is used try to infer the actual compiler by invoking it with -v flag
    env = os.environ.copy()
    env["LC_ALL"] = "C"  # Don't localize output
    version_string = subprocess.check_output(
        [compiler, "-v"], stderr=subprocess.STDOUT, env=env
    ).decode(*SUBPROCESS_DECODE_ARGS)
    if IS_LINUX:
        # Check for 'gcc' or 'g++' for sccache wrapper
        pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE)
        results = re.findall(pattern, version_string)
        if len(results) != 1:
            # Clang is also a supported compiler on Linux
            # Though on Ubuntu it's sometimes called "Ubuntu clang version"
            return "clang version" in version_string
        compiler_path = os.path.realpath(results[0].strip())
        # On RHEL/CentOS c++ is a gcc compiler wrapper
        if os.path.basename(compiler_path) == "c++" and "gcc version" in version_string:
            return True
        return any(name in compiler_path for name in _accepted_compilers_for_platform())
    return False
Version(".".join(version))) + + +class BuildExtension(build_ext): + """ + A custom :mod:`setuptools` build extension . + + This :class:`setuptools.build_ext` subclass takes care of passing the + minimum required compiler flags (e.g. ``-std=c++20``) as well as mixed + C++/CUDA compilation (and support for CUDA files in general). + + When using :class:`BuildExtension`, it is allowed to supply a dictionary + for ``extra_compile_args`` (rather than the usual list) that maps from + languages (``cxx`` or ``nvcc``) to a list of additional compiler flags to + supply to the compiler. This makes it possible to supply different flags to + the C++ and CUDA compiler during mixed compilation. + + ``use_ninja`` (bool): If ``use_ninja`` is ``True`` (default), then we + attempt to build using the Ninja backend. Ninja greatly speeds up + compilation compared to the standard ``setuptools.build_ext``. + Fallbacks to the standard distutils backend if Ninja is not available. + + .. note:: + By default, the Ninja backend uses #CPUS + 2 workers to build the + extension. This may use up too many resources on some systems. One + can control the number of workers by setting the `MAX_JOBS` environment + variable to a non-negative number. + """ + + @classmethod + def with_options(cls, **options): + """Return a subclass with alternative constructor that extends any original keyword arguments to the original constructor with the given options.""" + + class cls_with_options(cls): # type: ignore[misc, valid-type] + def __init__(self, *args, **kwargs): + kwargs.update(options) + super().__init__(*args, **kwargs) + + return cls_with_options + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.no_python_abi_suffix = kwargs.get("no_python_abi_suffix", False) + + self.use_ninja = kwargs.get("use_ninja", True) + if self.use_ninja: + # Test if we can use ninja. Fallback otherwise. + msg = ( + "Attempted to use ninja as the BuildExtension backend but " + "{}. 
Falling back to using the slow distutils backend." + ) + if not is_ninja_available(): + warnings.warn(msg.format("we could not find ninja.")) + self.use_ninja = False + + def finalize_options(self) -> None: + super().finalize_options() + if self.use_ninja: + self.force = True + + def build_extensions(self) -> None: + import torch + + cuda_ext = False + extension_iter = iter(self.extensions) + extension = next(extension_iter, None) + while not cuda_ext and extension: + for source in extension.sources: + _, ext = os.path.splitext(source) + if ext == ".cu": + cuda_ext = True + break + extension = next(extension_iter, None) + + for extension in self.extensions: + # Ensure at least an empty list of flags for 'cxx' and 'nvcc' when + # extra_compile_args is a dict. Otherwise, default torch flags do + # not get passed. Necessary when only one of 'cxx' and 'nvcc' is + # passed to extra_compile_args in CUDAExtension, i.e. + # CUDAExtension(..., extra_compile_args={'cxx': [...]}) + # or + # CUDAExtension(..., extra_compile_args={'nvcc': [...]}) + if isinstance(extension.extra_compile_args, dict): + for ext in ["cxx", "nvcc"]: + if ext not in extension.extra_compile_args: + extension.extra_compile_args[ext] = [] + + self._add_compile_flag(extension, "-DTORCH_API_INCLUDE_EXTENSION_H") + # See note [Pybind11 ABI constants] + for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]: + val = getattr(torch._C, f"_PYBIND11_{name}") + if val is not None: + self._add_compile_flag(extension, f'-DPYBIND11_{name}="{val}"') + self._define_torch_extension_name(extension) + self._add_gnu_cpp_abi_flag(extension) + + if "nvcc_dlink" in extension.extra_compile_args: + assert ( + self.use_ninja + ), f"With dlink=True, ninja is required to build cuda extension {extension.name}." + + # Register .cu, .cuh, .hip, and .mm as valid source extensions. 
+ self.compiler.src_extensions += [".cu", ".cuh", ".hip"] + if torch.backends.mps.is_built(): + self.compiler.src_extensions += [".mm"] + # Save the original _compile method for later. + if self.compiler.compiler_type == "msvc": + self.compiler._cpp_extensions += [".cu", ".cuh"] + original_compile = self.compiler.compile + original_spawn = self.compiler.spawn + else: + original_compile = self.compiler._compile + + def append_std17_if_no_std_present(cflags) -> None: + # NVCC does not allow multiple -std to be passed, so we avoid + # overriding the option if the user explicitly passed it. + cpp_format_prefix = ( + "/{}:" if self.compiler.compiler_type == "msvc" else "-{}=" + ) + cpp_flag_prefix = cpp_format_prefix.format("std") + cpp_flag = cpp_flag_prefix + "c++20" + if not any(flag.startswith(cpp_flag_prefix) for flag in cflags): + cflags.append(cpp_flag) + + # NVCC does not allow multiple -ccbin/--compiler-bindir to be passed, so we avoid + # overriding the option if the user explicitly passed it. + _ccbin = os.getenv("CC") + if _ccbin is not None and not any( + flag.startswith(("-ccbin", "--compiler-bindir")) for flag in cflags + ): + cflags.extend(["-ccbin", _ccbin]) + + return cflags + + def convert_to_absolute_paths_inplace(paths): + # Helper function. See Note [Absolute include_dirs] + if paths is not None: + for i in range(len(paths)): + if not os.path.isabs(paths[i]): + paths[i] = os.path.abspath(paths[i]) + + def unix_wrap_single_compile( + obj, src, ext, cc_args, extra_postargs, pp_opts + ) -> None: + # Copy before we make any modifications. 
+ cflags = copy.deepcopy(extra_postargs) + try: + original_compiler = self.compiler.compiler_so + if _is_cuda_file(src): + nvcc = [executable_path("hipcc")] + self.compiler.set_executable("compiler_so", nvcc) + if isinstance(cflags, dict): + cflags = cflags["nvcc"] + cflags = COMMON_HIPCC_FLAGS + cflags + _get_rocm_arch_flags(cflags) + + elif isinstance(cflags, dict): + cflags = cflags["cxx"] + if IS_HIP_EXTENSION: + cflags = COMMON_HIP_FLAGS + cflags + append_std17_if_no_std_present(cflags) + + original_compile(obj, src, ext, cc_args, cflags, pp_opts) + finally: + # Put the original compiler back in place. + self.compiler.set_executable("compiler_so", original_compiler) + + def unix_wrap_ninja_compile( + sources, + output_dir=None, + macros=None, + include_dirs=None, + debug=0, + extra_preargs=None, + extra_postargs=None, + depends=None, + ): + r"""Compiles sources by outputting a ninja file and running it.""" + # NB: I copied some lines from self.compiler (which is an instance + # of distutils.UnixCCompiler). See the following link. + # https://github.com/python/cpython/blob/f03a8f8d5001963ad5b5b28dbd95497e9cc15596/Lib/distutils/ccompiler.py#L564-L567 + # This can be fragile, but a lot of other repos also do this + # (see https://github.com/search?q=_setup_compile&type=Code) + # so it is probably OK; we'll also get CI signal if/when + # we update our python version (which is when distutils can be + # upgraded) + + # Use absolute path for output_dir so that the object file paths + # (`objects`) get generated with absolute paths. 
+ output_dir = os.path.abspath(output_dir) + + # See Note [Absolute include_dirs] + convert_to_absolute_paths_inplace(self.compiler.include_dirs) + + _, objects, extra_postargs, pp_opts, _ = self.compiler._setup_compile( + output_dir, macros, include_dirs, sources, depends, extra_postargs + ) + common_cflags = self.compiler._get_cc_args(pp_opts, debug, extra_preargs) + extra_cc_cflags = self.compiler.compiler_so[1:] + with_cuda = any(map(_is_cuda_file, sources)) + + # extra_postargs can be either: + # - a dict mapping cxx/nvcc to extra flags + # - a list of extra flags. + if isinstance(extra_postargs, dict): + post_cflags = extra_postargs["cxx"] + else: + post_cflags = list(extra_postargs) + if IS_HIP_EXTENSION: + post_cflags = COMMON_HIP_FLAGS + post_cflags + append_std17_if_no_std_present(post_cflags) + + cuda_post_cflags = None + cuda_cflags = None + if with_cuda: + cuda_cflags = common_cflags + if isinstance(extra_postargs, dict): + cuda_post_cflags = extra_postargs["nvcc"] + else: + cuda_post_cflags = list(extra_postargs) + cuda_post_cflags = cuda_post_cflags + _get_rocm_arch_flags( + cuda_post_cflags + ) + cuda_post_cflags = ( + COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS + cuda_post_cflags + ) + + append_std17_if_no_std_present(cuda_post_cflags) + cuda_cflags = [shlex.quote(f) for f in cuda_cflags] + cuda_post_cflags = [shlex.quote(f) for f in cuda_post_cflags] + + _write_ninja_file_and_compile_objects( + sources=sources, + objects=objects, + cflags=[shlex.quote(f) for f in extra_cc_cflags + common_cflags], + post_cflags=[shlex.quote(f) for f in post_cflags], + cuda_cflags=cuda_cflags, + cuda_post_cflags=cuda_post_cflags, + cuda_dlink_post_cflags=None, + build_directory=output_dir, + verbose=True, + with_cuda=with_cuda, + ) + + # Return *all* object filenames, not just the ones we just built. + return objects + + # Monkey-patch the _compile or compile method. 
+ # https://github.com/python/cpython/blob/dc0284ee8f7a270b6005467f26d8e5773d76e959/Lib/distutils/ccompiler.py#L511 + if self.compiler.compiler_type == "msvc": + print("currently only support unix") + # if self.use_ninja: + # self.compiler.compile = win_wrap_ninja_compile + # else: + # self.compiler.compile = win_wrap_single_compile + else: + if self.use_ninja: + self.compiler.compile = unix_wrap_ninja_compile + else: + self.compiler._compile = unix_wrap_single_compile + + build_ext.build_extensions(self) + + def get_ext_filename(self, ext_name): + # Get the original shared library name. For Python 3, this name will be + # suffixed with ".so", where will be something like + # cpython-37m-x86_64-linux-gnu. + ext_filename = super().get_ext_filename(ext_name) + # If `no_python_abi_suffix` is `True`, we omit the Python 3 ABI + # component. This makes building shared libraries with setuptools that + # aren't Python modules nicer. + if self.no_python_abi_suffix: + # The parts will be e.g. ["my_extension", "cpython-37m-x86_64-linux-gnu", "so"]. + ext_filename_parts = ext_filename.split(".") + # Omit the second to last element. 
def CppExtension(name, sources, *args, **kwargs):
    """
    Create a :class:`setuptools.Extension` for C++.

    Convenience method that creates a :class:`setuptools.Extension` with the
    bare minimum (but often sufficient) arguments to build a C++ extension.

    The ``include_dirs``, ``library_dirs`` and ``libraries`` keyword arguments
    are extended with the torch include paths, the torch library paths and the
    ``c10``/``torch``/``torch_cpu``/``torch_python`` libraries respectively,
    and ``language`` is forced to ``"c++"``.  The caller's own list objects are
    copied, never modified, so one list can safely be shared between several
    extensions.

    All arguments are forwarded to the :class:`setuptools.Extension`
    constructor. Full list arguments can be found at
    https://setuptools.pypa.io/en/latest/userguide/ext_modules.html#extension-api-reference

    Example:
        >>> # xdoctest: +SKIP
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CPP_EXT)
        >>> from setuptools import setup
        >>> from torch.utils.cpp_extension import BuildExtension, CppExtension
        >>> setup(
        ...     name='extension',
        ...     ext_modules=[
        ...         CppExtension(
        ...             name='extension',
        ...             sources=['extension.cpp'],
        ...             extra_compile_args=['-g'],
        ...             extra_link_args=['-Wl,--no-as-needed', '-lm'])
        ...     ],
        ...     cmdclass={
        ...         'build_ext': BuildExtension
        ...     })
    """
    # list(...) copies so that `+` below never aliases the caller's list.
    kwargs["include_dirs"] = list(kwargs.get("include_dirs", [])) + include_paths()
    kwargs["library_dirs"] = list(kwargs.get("library_dirs", [])) + library_paths()
    kwargs["libraries"] = list(kwargs.get("libraries", [])) + [
        "c10",
        "torch",
        "torch_cpu",
        "torch_python",
    ]

    kwargs["language"] = "c++"
    return setuptools.Extension(name, sources, *args, **kwargs)
If a visible card has a compute capability (CC) that's + newer than the newest version for which your nvcc can build fully-compiled binaries, Pytorch + will make nvcc fall back to building kernels with the newest version of PTX your nvcc does + support (see below for details on PTX). + + You can override the default behavior using `TORCH_CUDA_ARCH_LIST` to explicitly specify which + CCs you want the extension to support: + + ``TORCH_CUDA_ARCH_LIST="6.1 8.6" python build_my_extension.py`` + ``TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" python build_my_extension.py`` + + The +PTX option causes extension kernel binaries to include PTX instructions for the specified + CC. PTX is an intermediate representation that allows kernels to runtime-compile for any CC >= + the specified CC (for example, 8.6+PTX generates PTX that can runtime-compile for any GPU with + CC >= 8.6). This improves your binary's forward compatibility. However, relying on older PTX to + provide forward compat by runtime-compiling for newer CCs can modestly reduce performance on + those newer CCs. If you know exact CC(s) of the GPUs you want to target, you're always better + off specifying them individually. For example, if you want your extension to run on 8.0 and 8.6, + "8.0+PTX" would work functionally because it includes PTX that can runtime-compile for 8.6, but + "8.0 8.6" would be better. + + Note that while it's possible to include all supported archs, the more archs get included the + slower the building process will be, as it will build a separate kernel image for each arch. + + Note that CUDA-11.5 nvcc will hit internal compiler error while parsing torch/extension.h on Windows. + To workaround the issue, move python binding logic to pure C++ file. + + Example use: + #include + at::Tensor SigmoidAlphaBlendForwardCuda(....) + + Instead of: + #include + torch::Tensor SigmoidAlphaBlendForwardCuda(...) 
+ + Currently open issue for nvcc bug: https://github.com/pytorch/pytorch/issues/69460 + Complete workaround code example: https://github.com/facebookresearch/pytorch3d/commit/cb170ac024a949f1f9614ffe6af1c38d972f7d48 + + Relocatable device code linking: + + If you want to reference device symbols across compilation units (across object files), + the object files need to be built with `relocatable device code` (-rdc=true or -dc). + An exception to this rule is "dynamic parallelism" (nested kernel launches) which is not used a lot anymore. + `Relocatable device code` is less optimized so it needs to be used only on object files that need it. + Using `-dlto` (Device Link Time Optimization) at the device code compilation step and `dlink` step + help reduce the protentional perf degradation of `-rdc`. + Note that it needs to be used at both steps to be useful. + + If you have `rdc` objects you need to have an extra `-dlink` (device linking) step before the CPU symbol linking step. + There is also a case where `-dlink` is used without `-rdc`: + when an extension is linked against a static lib containing rdc-compiled objects + like the [SHMEM library]. + + Note: Ninja is required to build a CUDA Extension with RDC linking. + + Example: + >>> # xdoctest: +SKIP + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CPP_EXT) + >>> CUDAExtension( + ... name='cuda_extension', + ... sources=['extension.cpp', 'extension_kernel.cu'], + ... dlink=True, + ... dlink_libraries=["dlink_lib"], + ... extra_compile_args={'cxx': ['-g'], + ... 
'nvcc': ['-O2', '-rdc=true']}) + """ + library_dirs = kwargs.get("library_dirs", []) + library_dirs += library_paths(cuda=True) + kwargs["library_dirs"] = library_dirs + + libraries = kwargs.get("libraries", []) + libraries.append("c10") + libraries.append("torch") + libraries.append("torch_cpu") + libraries.append("torch_python") + if IS_HIP_EXTENSION: + libraries.append("amdhip64") + libraries.append("c10_hip") + libraries.append("torch_hip") + else: + libraries.append("cudart") + libraries.append("c10_cuda") + libraries.append("torch_cuda") + kwargs["libraries"] = libraries + + include_dirs = kwargs.get("include_dirs", []) + + if IS_HIP_EXTENSION: + build_dir = os.getcwd() + hipify_result = hipify_python.hipify( + project_directory=build_dir, + output_directory=build_dir, + header_include_dirs=include_dirs, + includes=[os.path.join(build_dir, "*")], # limit scope to build_dir only + extra_files=[os.path.abspath(s) for s in sources], + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True, # don't hipify everything in includes path + ) + + hipified_sources = set() + for source in sources: + s_abs = os.path.abspath(source) + hipified_s_abs = ( + hipify_result[s_abs].hipified_path + if ( + s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None + ) + else s_abs + ) + # setup() arguments must *always* be /-separated paths relative to the setup.py directory, + # *never* absolute paths + hipified_sources.add(os.path.relpath(hipified_s_abs, build_dir)) + + sources = list(hipified_sources) + + include_dirs += include_paths(cuda=True) + kwargs["include_dirs"] = include_dirs + + kwargs["language"] = "c++" + + dlink_libraries = kwargs.get("dlink_libraries", []) + dlink = kwargs.get("dlink", False) or dlink_libraries + if dlink: + extra_compile_args = kwargs.get("extra_compile_args", {}) + + extra_compile_args_dlink = extra_compile_args.get("nvcc_dlink", []) + extra_compile_args_dlink += ["-dlink"] + extra_compile_args_dlink += 
def include_paths(cuda: bool = False) -> List[str]:
    """
    Return the header search paths needed to compile an extension.

    Args:
        cuda: When true and the build targets HIP, the HIP-specific torch
            headers and the ROCm ``include`` directory are appended.

    Returns:
        A list of include directories, torch's own ``include`` folder first.
    """
    import torch

    torch_include = os.path.join(os.path.dirname(torch.__file__), "include")
    # Sub-folders of torch's include dir that must also be searchable:
    #  - torch/csrc/api/include: needed while torch/torch.h is still supported
    #    for C++ extensions.
    #  - TH / THC: some old internal Torch headers do not prefix their includes.
    nested = (
        ("torch", "csrc", "api", "include"),
        ("TH",),
        ("THC",),
    )
    result = [torch_include]
    result.extend(os.path.join(torch_include, *parts) for parts in nested)

    if not (cuda and IS_HIP_EXTENSION):
        return result

    result += [
        os.path.join(torch_include, "THH"),
        _join_rocm_home("include"),
    ]
    return result
def load(
    name,
    sources: Union[str, List[str]],
    extra_cflags=None,
    extra_cuda_cflags=None,
    extra_ldflags=None,
    extra_include_paths=None,
    build_directory=None,
    verbose=False,
    with_cuda: Optional[bool] = None,
    is_python_module=True,
    is_standalone=False,
    keep_intermediates=True,
    torch_exclude=False,
):
    """
    Build an extension just-in-time (JIT) with Ninja.

    This is a thin front end for ``_jit_compile``: a single source path is
    wrapped into a list and every other argument is forwarded unchanged.
    ``_jit_compile`` emits a ``build.ninja`` file into ``build_directory``,
    compiles ``sources`` there and then loads (or locates) the result.

    The C++ compiler can be overridden with the ``CXX`` environment variable.
    Extra compiler and linker arguments are passed through ``extra_cflags``,
    ``extra_cuda_cflags`` and ``extra_ldflags``; for example
    ``extra_cflags=['-O3']`` enables optimizations.

    Args:
        name: The name of the extension to build. This MUST be the same as the
            name of the pybind11 module!
        sources: A single path or a list of relative or absolute paths to the
            source files.
        extra_cflags: optional list of compiler flags to forward to the build.
        extra_cuda_cflags: optional list of compiler flags to forward to the
            device compiler when building GPU sources.
        extra_ldflags: optional list of linker flags to forward to the build.
        extra_include_paths: optional list of include directories to forward
            to the build.
        build_directory: path used as the build workspace.
            NOTE(review): the default is ``None``, but ``_jit_compile`` joins
            this value with ``"lock"`` unconditionally, so callers appear to
            be required to pass a real directory - confirm.
        verbose: If ``True``, turns on verbose logging of load steps.
        with_cuda: Whether GPU headers and libraries are added to the build.
            If ``None`` (default), ``_jit_compile`` derives the value from
            ``sources``.
        is_python_module: If ``True`` (default), the library is built as a
            Python module. Mutually exclusive with ``is_standalone``.
        is_standalone: If ``True``, build a standalone executable instead of a
            shared library.
        keep_intermediates: Forwarded to the generated-file cleaner used while
            hipifying sources; ``True`` (default) keeps the generated files.
        torch_exclude: If ``True``, torch headers, defines and libraries are
            left out of the build.

    Returns:
        Whatever ``_jit_compile`` returns. If ``is_standalone`` is ``True``
        that is the path to the executable.
        NOTE(review): in every other case the current
        ``_import_module_from_library`` returns ``None`` (the module import
        code is unreachable), so callers seem to be expected to import the
        built library themselves - confirm.

    Raises:
        ValueError: if both ``is_python_module`` and ``is_standalone`` are
            true (raised by ``_jit_compile``).

    Example:
        >>> # xdoctest: +SKIP
        >>> load(
        ...     name='extension',
        ...     sources=['extension.cpp', 'extension_kernel.cu'],
        ...     extra_cflags=['-O2'],
        ...     build_directory='/path/to/build',
        ...     verbose=True)
    """
    source_list = [sources] if isinstance(sources, str) else sources
    return _jit_compile(
        name,
        source_list,
        extra_cflags,
        extra_cuda_cflags,
        extra_ldflags,
        extra_include_paths,
        build_directory,
        verbose,
        with_cuda,
        is_python_module,
        is_standalone,
        keep_intermediates=keep_intermediates,
        torch_exclude=torch_exclude,
    )
def _jit_compile(
    name,
    sources,
    extra_cflags,
    extra_cuda_cflags,
    extra_ldflags,
    extra_include_paths,
    build_directory: str,
    verbose: bool,
    with_cuda: Optional[bool],
    is_python_module,
    is_standalone,
    keep_intermediates=True,
    torch_exclude=False,
    hipify=True,
):
    """
    Version, (re)build if needed, and load one JIT extension.

    The extension is rebuilt only when the versioner reports that the sources
    or build arguments changed. A lock file in ``build_directory`` makes sure
    only one process builds at a time; the others wait for it to finish.

    Args:
        name: Extension name; ``_v<version>`` is appended once the version
            is greater than zero.
        sources: List of source file paths.
        extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths:
            Optional lists of additional build arguments (``None`` allowed).
        build_directory: Directory holding the lock file and build outputs.
        verbose: Print progress messages to stderr.
        with_cuda: Build with GPU support; ``None`` derives it from ``sources``.
        is_python_module: Build a Python module.
        is_standalone: Build an executable. Mutually exclusive with
            ``is_python_module``.
        keep_intermediates: Passed to ``GeneratedFileCleaner``.
        torch_exclude: Leave torch out of the build and out of the hipify
            ignore list.
        hipify: Run hipify on the sources for HIP builds with GPU support.

    Returns:
        The executable path when ``is_standalone`` is true, otherwise the
        result of ``_import_module_from_library``.

    Raises:
        ValueError: if ``is_python_module`` and ``is_standalone`` are both true.
    """
    if is_python_module and is_standalone:
        raise ValueError(
            "`is_python_module` and `is_standalone` are mutually exclusive."
        )

    if with_cuda is None:
        with_cuda = any(map(_is_cuda_file, sources))
    old_version = JIT_EXTENSION_VERSIONER.get_version(name)
    version = JIT_EXTENSION_VERSIONER.bump_version_if_changed(
        name,
        sources,
        build_arguments=[
            extra_cflags,
            extra_cuda_cflags,
            extra_ldflags,
            extra_include_paths,
        ],
        build_directory=build_directory,
        with_cuda=with_cuda,
        is_python_module=is_python_module,
        is_standalone=is_standalone,
    )
    if version > 0:
        if version != old_version and verbose:
            print(
                f"The input conditions for extension module {name} have changed. "
                + f"Bumping to version {version} and re-building as {name}_v{version}...",
                file=sys.stderr,
            )
        name = f"{name}_v{version}"

    baton = FileBaton(os.path.join(build_directory, "lock"))
    if baton.try_acquire():
        try:
            if version != old_version:
                with GeneratedFileCleaner(
                    keep_intermediates=keep_intermediates
                ) as clean_ctx:
                    torch_path = ""
                    if not torch_exclude:
                        import torch

                        torch_path = os.path.join(
                            os.path.dirname(torch.__file__), "*"
                        )

                    if IS_HIP_EXTENSION and with_cuda and hipify:
                        hipify_result = hipify_python.hipify(
                            project_directory=build_directory,
                            output_directory=build_directory,
                            header_include_dirs=(
                                extra_include_paths
                                if extra_include_paths is not None
                                else []
                            ),
                            extra_files=[os.path.abspath(s) for s in sources],
                            ignores=[
                                _join_rocm_home("*"),
                                torch_path,
                            ],  # no need to hipify ROCm or PyTorch headers
                            show_detailed=verbose,
                            show_progress=verbose,
                            is_pytorch_extension=True,
                            hipify_extra_files_only=True,  # don't hipify everything in includes path
                            clean_ctx=clean_ctx,
                        )

                        hipified_sources = set()
                        for source in sources:
                            s_abs = os.path.abspath(source)
                            # A hipify entry may carry no hipified path; fall
                            # back to the original file in that case, as the
                            # ahead-of-time extension path in this file does,
                            # so that ``None`` never reaches the build.
                            hipified_path = (
                                hipify_result[s_abs].hipified_path
                                if s_abs in hipify_result
                                else None
                            )
                            hipified_sources.add(
                                hipified_path if hipified_path is not None else s_abs
                            )

                        sources = list(hipified_sources)

                    _write_ninja_file_and_build_library(
                        name=name,
                        sources=sources,
                        extra_cflags=extra_cflags or [],
                        extra_cuda_cflags=extra_cuda_cflags or [],
                        extra_ldflags=extra_ldflags or [],
                        extra_include_paths=extra_include_paths or [],
                        build_directory=build_directory,
                        verbose=verbose,
                        with_cuda=with_cuda,
                        is_python_module=is_python_module,
                        is_standalone=is_standalone,
                        torch_exclude=torch_exclude,
                    )
            elif verbose:
                print(
                    "No modifications detected for re-loaded extension "
                    f"module {name}, skipping build step...",
                    file=sys.stderr,
                )
        finally:
            # Always drop the lock, even when the build failed, so that
            # waiting processes are not blocked forever.
            baton.release()
    else:
        # Another process holds the lock and is building; wait for it.
        baton.wait()

    if verbose:
        print(f"Loading extension module {name}...", file=sys.stderr)

    if is_standalone:
        return _get_exec_path(name, build_directory)

    return _import_module_from_library(
        name, build_directory, is_python_module, torch_exclude
    )
def is_ninja_available():
    """Return ``True`` if the ninja build system (https://ninja-build.org/) can be run on this system, ``False`` otherwise."""
    probe_command = ["ninja", "--version"]
    try:
        subprocess.check_output(probe_command)
        found = True
    except Exception:
        # Any failure (missing binary, non-zero exit, ...) counts as "not available".
        found = False
    return found
def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone, torch_exclude):
    """
    Return the linker flags for a build: user flags plus the required defaults.

    The input list is copied, never modified. Previously the flags were
    appended to the caller's list in place, so a list reused across several
    builds kept growing with duplicate flags.

    Args:
        extra_ldflags: User supplied linker flags (``None`` or empty allowed).
        with_cuda: Whether GPU libraries must be linked.
        verbose: Print a note to stderr when GPU link flags are added.
        is_standalone: Link an executable rather than a Python-loadable library.
        torch_exclude: Skip all torch libraries.

    Returns:
        A new list: the user flags, the common section/GC flags, the torch
        libraries (unless excluded) and, for HIP GPU builds, the ROCm runtime.
    """
    ldflags = list(extra_ldflags) if extra_ldflags else []
    ldflags += [
        "-mcmodel=large",
        "-ffunction-sections",
        # The trailing blank is kept as-is; flags are stripped before they
        # are written to the ninja file.
        "-fdata-sections ",
        "-Wl,--gc-sections",
        "-Wl,--cref",
    ]

    if not torch_exclude:
        import torch

        torch_lib_path = os.path.join(os.path.dirname(torch.__file__), "lib")
        ldflags.append(f"-L{torch_lib_path}")
        ldflags.append("-lc10")
        if with_cuda:
            ldflags.append("-lc10_hip" if IS_HIP_EXTENSION else "-lc10_cuda")
        ldflags.append("-ltorch_cpu")
        if with_cuda:
            ldflags.append("-ltorch_hip" if IS_HIP_EXTENSION else "-ltorch_cuda")
        ldflags.append("-ltorch")
        if is_standalone:
            # The rpath points at torch's lib directory, so it is only
            # meaningful (and the path only defined) when torch is linked.
            ldflags.append(f"-Wl,-rpath,{torch_lib_path}")
        else:
            ldflags.append("-ltorch_python")

    if with_cuda and IS_HIP_EXTENSION:
        if verbose:
            print("Detected CUDA files, patching ldflags", file=sys.stderr)

        ldflags.append(f'-L{_join_rocm_home("lib")}')
        ldflags.append("-lamdhip64")
    return ldflags
def _get_num_workers(verbose: bool) -> Optional[int]:
    """
    Work out how many parallel jobs ninja should run.

    The count starts from the ``MAX_JOBS`` environment variable when it is a
    plain non-negative integer, capped at 80% of the CPU count; otherwise 80%
    of the CPU count is used. If ``PREBUILD_THREAD_NUM`` is a positive integer
    the count is divided by it.

    The result is always at least 1. A value of 0 must never be returned
    because ``ninja -j 0`` means "no limit". Invalid ``PREBUILD_THREAD_NUM``
    values (non-numeric or zero) are ignored instead of raising.

    Args:
        verbose: Print the chosen ``MAX_JOBS`` value to stderr.

    Returns:
        The number of workers, an integer >= 1.
    """
    # os.cpu_count() may return None when the count cannot be determined.
    cpu_cap = max(1, int((os.cpu_count() or 1) * 0.8))

    env_jobs = os.environ.get("MAX_JOBS")
    if env_jobs is not None and env_jobs.isdigit():
        workers = min(int(env_jobs), cpu_cap)
        if verbose:
            print(
                f"Using envvar MAX_JOBS ({workers}) as the number of workers...",
                file=sys.stderr,
            )
    else:
        workers = cpu_cap
        print(
            f"Using 0.8*cpu_cnt MAX_JOBS ({workers}) as the number of workers...",
            file=sys.stderr,
        )

    prebuild_thread_num = os.environ.get("PREBUILD_THREAD_NUM")
    if prebuild_thread_num is not None:
        prebuild_thread_num = prebuild_thread_num.strip()
        if prebuild_thread_num.isdigit() and int(prebuild_thread_num) > 0:
            workers //= int(prebuild_thread_num)

    return max(1, workers)
def _write_ninja_file_to_build_library(
    path,
    name,
    sources,
    extra_cflags,
    extra_cuda_cflags,
    extra_ldflags,
    extra_include_paths,
    with_cuda,
    is_python_module,
    is_standalone,
    torch_exclude,
) -> None:
    """
    Assemble all compile/link flags for one extension and write its ninja file.

    Args:
        path: Where the ninja file is written.
        name: Extension name; used for ``TORCH_EXTENSION_NAME`` and the
            output file name.
        sources: Source files to compile.
        extra_cflags: Additional C++ compiler flags.
        extra_cuda_cflags: Additional device compiler flags.
        extra_ldflags: Additional linker flags.
        extra_include_paths: Additional user include directories.
        with_cuda: Whether GPU sources are compiled.
        is_python_module: Add pybind11 / Python headers and ABI flags.
        is_standalone: Link an executable instead of a shared library.
        torch_exclude: Leave torch include paths and defines out.
    """
    extra_cflags = [flag.strip() for flag in extra_cflags]
    extra_cuda_cflags = [flag.strip() for flag in extra_cuda_cflags]
    extra_ldflags = [flag.strip() for flag in extra_ldflags]
    extra_include_paths = [flag.strip() for flag in extra_include_paths]
    # include_paths() gives us the location of torch/extension.h
    system_includes = [] if torch_exclude else include_paths(with_cuda)

    # FIXME: build python module excluded with torch, use `pybind11`
    # But we can't use this now because all aiter op based on torch
    # which means pybind11 related build flags must from torch now
    common_cflags = []
    if is_python_module:
        import pybind11

        extra_include_paths.append(pybind11.get_include())
        common_cflags += [f"{x}" for x in _get_pybind11_abi_build_flags()]
        common_cflags += [f"{x}" for x in _get_glibcxx_abi_build_flags()]

        # sysconfig.get_path('include') gives us the location of Python.h
        # Explicitly specify 'posix_prefix' scheme on non-Windows platforms to workaround error on some MacOS
        # installations where default `get_path` points to non-existing `/Library/Python/M.m/include` folder
        python_include_path = sysconfig.get_path("include", scheme="posix_prefix")
        if python_include_path is not None:
            system_includes.append(python_include_path)

    # Turn into absolute paths so we can emit them into the ninja build
    # file wherever it is.
    user_includes = [os.path.abspath(file) for file in extra_include_paths]

    if not torch_exclude:
        common_cflags.append(f"-DTORCH_EXTENSION_NAME={name}")
        common_cflags.append("-DTORCH_API_INCLUDE_EXTENSION_H")

    # Windows does not understand `-isystem` and quotes flags later.
    common_cflags += [f"-I{shlex.quote(include)}" for include in user_includes]
    common_cflags += [f"-isystem {shlex.quote(include)}" for include in system_includes]

    cflags = common_cflags + ["-fPIC", "-std=c++20"] + extra_cflags

    # Default to None so that builds without HIP GPU sources do not fail with
    # an unbound local below; _write_ninja_file accepts None for cuda_cflags.
    cuda_flags = None
    if with_cuda and IS_HIP_EXTENSION:
        cuda_flags = ["-DWITH_HIP"] + cflags + COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS
        cuda_flags += extra_cuda_cflags
        cuda_flags += _get_rocm_arch_flags(cuda_flags)

    def object_file_path(source_file: str) -> str:
        # '/path/to/file.cpp' -> 'file'
        file_name = os.path.splitext(os.path.basename(source_file))[0]
        if _is_cuda_file(source_file) and with_cuda:
            # Use a different object filename in case a C++ and CUDA file have
            # the same filename but different extension (.cpp vs. .cu).
            target = f"{file_name}.cuda.o"
        else:
            target = f"{file_name}.o"
        return target

    objects = [object_file_path(src) for src in sources]
    ldflags = ([] if is_standalone else [SHARED_FLAG]) + extra_ldflags

    ext = EXEC_EXT if is_standalone else LIB_EXT
    library_target = f"{name}{ext}"

    _write_ninja_file(
        path=path,
        cflags=cflags,
        post_cflags=None,
        cuda_cflags=cuda_flags,
        cuda_post_cflags=None,
        cuda_dlink_post_cflags=None,
        sources=sources,
        objects=objects,
        ldflags=ldflags,
        library_target=library_target,
        with_cuda=with_cuda,
    )
+ `sources`: list of paths to source files + `objects`: list of desired paths to objects, one per source. + `ldflags`: list of flags to pass to linker. Can be None. + `library_target`: Name of the output library. Can be None; in that case, + we do no linking. + `with_cuda`: If we should be compiling with CUDA. + """ + + def sanitize_flags(flags): + if flags is None: + return [] + else: + return [flag.strip() for flag in flags] + + cflags = sanitize_flags(cflags) + post_cflags = sanitize_flags(post_cflags) + cuda_cflags = sanitize_flags(cuda_cflags) + cuda_post_cflags = sanitize_flags(cuda_post_cflags) + cuda_dlink_post_cflags = sanitize_flags(cuda_dlink_post_cflags) + ldflags = sanitize_flags(ldflags) + + # Sanity checks... + assert len(sources) == len(objects) + assert len(sources) > 0 + + compiler = get_cxx_compiler() + + # Version 1.3 is required for the `deps` directive. + config = ["ninja_required_version = 1.3"] + config.append(f"cxx = {compiler}") + if with_cuda or cuda_dlink_post_cflags: + nvcc = executable_path("hipcc") + config.append(f"nvcc = {nvcc}") + + if IS_HIP_EXTENSION: + post_cflags = COMMON_HIP_FLAGS + post_cflags + flags = [f'cflags = {" ".join(cflags)}'] + flags.append(f'post_cflags = {" ".join(post_cflags)}') + if with_cuda: + flags.append(f'cuda_cflags = {" ".join(cuda_cflags)}') + flags.append(f'cuda_post_cflags = {" ".join(cuda_post_cflags)}') + flags.append(f'cuda_dlink_post_cflags = {" ".join(cuda_dlink_post_cflags)}') + + # Turn into absolute paths so we can emit them into the ninja build + # file wherever it is. + sources = [os.path.abspath(file) for file in sources] + + # See https://ninja-build.org/build.ninja.html for reference. 
+ compile_rule = ["rule compile"] + compile_rule.append( + " command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags" + ) + compile_rule.append(" depfile = $out.d") + compile_rule.append(" deps = gcc") + + if with_cuda: + cuda_compile_rule = ["rule cuda_compile"] + nvcc_gendeps = "" + cuda_compile_rule.append( + f" command = $nvcc {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags" + ) + + # Emit one build rule per source to enable incremental build. + build = [] + for source_file, object_file in zip(sources, objects): + is_cuda_source = _is_cuda_file(source_file) and with_cuda + rule = "cuda_compile" if is_cuda_source else "compile" + + source_file = source_file.replace(" ", "$ ") + object_file = object_file.replace(" ", "$ ") + build.append(f"build {object_file}: {rule} {source_file}") + + flags.append(f'ldflags = {" ".join(ldflags)}') + if cuda_dlink_post_cflags: + devlink_out = os.path.join(os.path.dirname(objects[0]), "dlink.o") + devlink_rule = ["rule cuda_devlink"] + devlink_rule.append(" command = $nvcc $in -o $out $cuda_dlink_post_cflags") + devlink = [f'build {devlink_out}: cuda_devlink {" ".join(objects)}'] + objects += [devlink_out] + else: + devlink_rule, devlink = [], [] + + if library_target is not None: + link_rule = ["rule link"] + + link_rule.append( + " command = $cxx @$out.rsp $ldflags -o $out\n rspfile = $out.rsp\n rspfile_content = $in" + ) + + link = [f'build {library_target}: link {" ".join(objects)}'] + + default = [f"default {library_target}"] + else: + link_rule, link, default = [], [], [] + + # 'Blocks' should be separated by newlines, for visual benefit. 
class FileBaton:
    """A primitive, file-based synchronization utility."""

    def __init__(self, lock_file_path, wait_seconds=0.1):
        """
        Create a new :class:`FileBaton`.

        Args:
            lock_file_path: The path to the file used for locking.
            wait_seconds: The seconds to periodically sleep (spin) when
                calling ``wait()``.
        """
        self.lock_file_path = lock_file_path
        self.wait_seconds = wait_seconds
        # File descriptor of the lock file while this instance holds the baton.
        self.fd = None

    def try_acquire(self):
        """
        Try to atomically create a file under exclusive access.

        Returns:
            True if the file could be created, else False.
        """
        try:
            self.fd = os.open(self.lock_file_path, os.O_CREAT | os.O_EXCL)
            return True
        except FileExistsError:
            return False

    def wait(self):
        """
        Periodically sleeps for a certain amount until the baton is released.

        The amount of time slept depends on the ``wait_seconds`` parameter
        passed to the constructor.
        """
        while os.path.exists(self.lock_file_path):
            time.sleep(self.wait_seconds)

    def release(self):
        """
        Release the baton and remove its file.

        The stored descriptor is cleared before it is closed, so a repeated
        call can never close a descriptor number that has since been reused.
        The lock file is removed even if closing fails, so waiters are not
        left spinning on a stale lock.
        """
        fd, self.fd = self.fd, None
        try:
            if fd is not None:
                os.close(fd)
        finally:
            os.remove(self.lock_file_path)
# Type-of-mapping annotations.
# CONV_VERSION is a plain int like every other annotation; it used to be the
# one-element tuple ``(0,)``, which looks like a stray trailing comma.
CONV_VERSION = 0
CONV_INIT = 1
CONV_DEVICE = 2
CONV_MEM = 3
CONV_KERN = 4
CONV_COORD_FUNC = 5
CONV_MATH_FUNC = 6
CONV_DEVICE_FUNC = 7
CONV_SPECIAL_FUNC = 8
CONV_STREAM = 9
CONV_EVENT = 10
CONV_OCCUPANCY = 11
CONV_CONTEXT = 12
CONV_PEER = 13
CONV_MODULE = 14
CONV_CACHE = 15
CONV_EXEC = 16
CONV_ERROR = 17
CONV_DEF = 18
CONV_TEX = 19
CONV_GL = 20
CONV_GRAPHICS = 21
CONV_SURFACE = 22
CONV_JIT = 23
CONV_D3D9 = 24
CONV_D3D10 = 25
CONV_D3D11 = 26
CONV_VDPAU = 27
CONV_EGL = 28
CONV_THREAD = 29
CONV_OTHER = 30
CONV_INCLUDE = 31
CONV_INCLUDE_CUDA_MAIN_H = 32
CONV_TYPE = 33
CONV_LITERAL = 34
CONV_NUMERIC_LITERAL = 35
CONV_LAST = 36

# API-of-mapping annotations.
API_DRIVER = 37
API_RUNTIME = 38
API_BLAS = 39
API_SPECIAL = 40
API_RAND = 41
API_LAST = 42
API_FFT = 43
API_RTC = 44
API_ROCTX = 45

# Marker for mappings that are not supported, followed by further API
# annotations.
HIP_UNSUPPORTED = 46
API_PYTORCH = 1337
API_CAFFE2 = 1338
API_C10 = 1339
API_ROCMSMI = 1340
Mapping of CUDA functions, include files, constants, and types to ROCm/HIP equivalents +This closely follows the implementation in hipify-clang +and its structure. +There are different maps for fundamental names, include files, identifies, sparse, and +PyTorch specific translations. +Each of the entries in these maps translates a CUDA string to a tuple containing the +ROCm/HIP string, a type and API annotation and - optionally - an annotation if it is not +supported in ROCm/HIP yet. +""" + +# List of math functions that should be replaced inside device code only. +MATH_TRANSPILATIONS = collections.OrderedDict( + [ + ("std::max", ("::max")), + ("std::min", ("::min")), + ("std::ceil", ("::ceil")), + ("std::floor", ("::floor")), + ("std::exp", ("::exp")), + ("std::log", ("::log")), + ("std::pow", ("::pow")), + ("std::fabs", ("::fabs")), + ("std::fmod", ("::fmod")), + ("std::remainder", ("::remainder")), + ("std::frexp", ("::frexp")), + ] +) + +CUDA_TYPE_NAME_MAP = collections.OrderedDict( + [ + ("CUresult", ("hipError_t", CONV_TYPE, API_DRIVER)), + ("cudaError_t", ("hipError_t", CONV_TYPE, API_RUNTIME)), + ("cudaError", ("hipError_t", CONV_TYPE, API_RUNTIME)), + ( + "CUDA_ARRAY3D_DESCRIPTOR", + ("HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUDA_ARRAY_DESCRIPTOR", ("HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER)), + ("CUDA_MEMCPY2D", ("hip_Memcpy2D", CONV_TYPE, API_DRIVER)), + ("CUDA_MEMCPY3D", ("HIP_MEMCPY3D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUDA_MEMCPY3D_PEER", + ("HIP_MEMCPY3D_PEER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_POINTER_ATTRIBUTE_P2P_TOKENS", + ( + "HIP_POINTER_ATTRIBUTE_P2P_TOKENS", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CUDA_RESOURCE_DESC", + ("HIP_RESOURCE_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_RESOURCE_VIEW_DESC", + ("HIP_RESOURCE_VIEW_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUipcEventHandle", + ("hipIpcEventHandle", 
CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUipcMemHandle", ("hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ("CUaddress_mode", ("hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUarray_cubemap_face", + ("hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUarray_format", ("hipArray_format", CONV_TYPE, API_DRIVER)), + ("CUcomputemode", ("hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ("CUmem_advise", ("hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUmem_range_attribute", + ("hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUctx_flags", ("hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ("CUdevice", ("hipDevice_t", CONV_TYPE, API_DRIVER)), + ("CUdevice_attribute_enum", ("hipDeviceAttribute_t", CONV_TYPE, API_DRIVER)), + ("CUdevice_attribute", ("hipDeviceAttribute_t", CONV_TYPE, API_DRIVER)), + ("CUpointer_attribute", ("hipPointer_attribute", CONV_TYPE, API_DRIVER)), + ( + "CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL", + ("HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL", CONV_TYPE, API_DRIVER), + ), + ( + "CU_POINTER_ATTRIBUTE_BUFFER_ID", + ("HIP_POINTER_ATTRIBUTE_BUFFER_ID", CONV_TYPE, API_DRIVER), + ), + ("CUdeviceptr", ("hipDeviceptr_t", CONV_TYPE, API_DRIVER)), + ("CUarray_st", ("hipArray", CONV_TYPE, API_DRIVER)), + ("CUarray", ("hipArray *", CONV_TYPE, API_DRIVER)), + ("CUdevprop_st", ("hipDeviceProp_t", CONV_TYPE, API_DRIVER)), + ("CUdevprop", ("hipDeviceProp_t", CONV_TYPE, API_DRIVER)), + ("CUfunction", ("hipFunction_t", CONV_TYPE, API_DRIVER)), + ( + "CUgraphicsResource", + ("hipGraphicsResource_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUmipmappedArray", + ("hipMipmappedArray_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUfunction_attribute", + ("hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUfunction_attribute_enum", + ("hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + 
( + "CUgraphicsMapResourceFlags", + ("hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUgraphicsMapResourceFlags_enum", + ("hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUgraphicsRegisterFlags", + ("hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUgraphicsRegisterFlags_enum", + ("hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUoccupancy_flags", + ("hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUoccupancy_flags_enum", + ("hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUfunc_cache_enum", ("hipFuncCache", CONV_TYPE, API_DRIVER)), + ("CUfunc_cache", ("hipFuncCache", CONV_TYPE, API_DRIVER)), + ("CUipcMem_flags", ("hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUipcMem_flags_enum", + ("hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUjit_cacheMode", ("hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUjit_cacheMode_enum", + ("hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUjit_fallback", ("hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUjit_fallback_enum", + ("hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUjit_option", ("hipJitOption", CONV_JIT, API_DRIVER)), + ("CUjit_option_enum", ("hipJitOption", CONV_JIT, API_DRIVER)), + ("CUjit_target", ("hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ("CUjit_target_enum", ("hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ("CUjitInputType", ("hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUjitInputType_enum", + ("hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUlimit", ("hipLimit_t", CONV_TYPE, API_DRIVER)), + ("CUlimit_enum", ("hipLimit_t", CONV_TYPE, API_DRIVER)), + ( + "CUmemAttach_flags", + ("hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"CUmemAttach_flags_enum", + ("hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUmemorytype", ("hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ("CUmemorytype_enum", ("hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ("CUresourcetype", ("hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUresourcetype_enum", + ("hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUresourceViewFormat", ("hipResourceViewFormat", CONV_TEX, API_DRIVER)), + ("CUresourceViewFormat_enum", ("hipResourceViewFormat", CONV_TEX, API_DRIVER)), + ("CUsharedconfig", ("hipSharedMemConfig", CONV_TYPE, API_DRIVER)), + ("CUsharedconfig_enum", ("hipSharedMemConfig", CONV_TYPE, API_DRIVER)), + ("CUcontext", ("hipCtx_t", CONV_TYPE, API_DRIVER)), + ("CUmodule", ("hipModule_t", CONV_TYPE, API_DRIVER)), + ("CUstream", ("hipStream_t", CONV_TYPE, API_DRIVER)), + ("CUstream_st", ("ihipStream_t", CONV_TYPE, API_DRIVER)), + ("CUstreamCallback", ("hipStreamCallback_t", CONV_TYPE, API_DRIVER)), + ("CUsurfObject", ("hipSurfaceObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUsurfref", + ("hipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUtexObject", ("hipTextureObject_t", CONV_TYPE, API_DRIVER)), + ("CUtexref", ("textureReference", CONV_TYPE, API_DRIVER)), + ("CUstream_flags", ("hipStreamFlags", CONV_TYPE, API_DRIVER)), + ( + "CUstreamWaitValue_flags", + ("hipStreamWaitValueFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUstreamWriteValue_flags", + ("hipStreamWriteValueFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUstreamBatchMemOpType", + ("hipStreamBatchMemOpType", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUdevice_P2PAttribute", + ("hipDeviceP2PAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUevent", ("hipEvent_t", CONV_TYPE, API_DRIVER)), + ("CUevent_st", ("ihipEvent_t", CONV_TYPE, API_DRIVER)), + ("CUevent_flags", ("hipEventFlags", CONV_EVENT, 
API_DRIVER, HIP_UNSUPPORTED)), + ("CUfilter_mode", ("hipTextureFilterMode", CONV_TEX, API_DRIVER)), + ("CUGLDeviceList", ("hipGLDeviceList", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ("CUGLmap_flags", ("hipGLMapFlags", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUd3d9DeviceList", + ("hipD3D9DeviceList", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d9map_flags", + ("hipD3D9MapFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d9register_flags", + ("hipD3D9RegisterFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d10DeviceList", + ("hipd3d10DeviceList", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d10map_flags", + ("hipD3D10MapFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d10register_flags", + ("hipD3D10RegisterFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d11DeviceList", + ("hipd3d11DeviceList", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUeglStreamConnection_st", + ("hipEglStreamConnection", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUeglStreamConnection", + ("hipEglStreamConnection", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "libraryPropertyType_t", + ("hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "libraryPropertyType", + ("hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaStreamCallback_t", ("hipStreamCallback_t", CONV_TYPE, API_RUNTIME)), + ("cudaArray", ("hipArray", CONV_MEM, API_RUNTIME)), + ("cudaArray_t", ("hipArray_t", CONV_MEM, API_RUNTIME)), + ("cudaArray_const_t", ("hipArray_const_t", CONV_MEM, API_RUNTIME)), + ("cudaMipmappedArray_t", ("hipMipmappedArray_t", CONV_MEM, API_RUNTIME)), + ( + "cudaMipmappedArray_const_t", + ("hipMipmappedArray_const_t", CONV_MEM, API_RUNTIME), + ), + ("cudaArrayDefault", ("hipArrayDefault", CONV_MEM, API_RUNTIME)), + ("cudaArrayLayered", ("hipArrayLayered", CONV_MEM, API_RUNTIME)), + ( + "cudaArraySurfaceLoadStore", + 
("hipArraySurfaceLoadStore", CONV_MEM, API_RUNTIME), + ), + ("cudaArrayCubemap", ("hipArrayCubemap", CONV_MEM, API_RUNTIME)), + ("cudaArrayTextureGather", ("hipArrayTextureGather", CONV_MEM, API_RUNTIME)), + ( + "cudaMemoryAdvise", + ("hipMemoryAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeAttribute", + ("hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpyKind", ("hipMemcpyKind", CONV_MEM, API_RUNTIME)), + ("cudaMemoryType", ("hipMemoryType", CONV_MEM, API_RUNTIME)), + ("cudaExtent", ("hipExtent", CONV_MEM, API_RUNTIME)), + ("cudaPitchedPtr", ("hipPitchedPtr", CONV_MEM, API_RUNTIME)), + ("cudaPos", ("hipPos", CONV_MEM, API_RUNTIME)), + ("cudaEvent_t", ("hipEvent_t", CONV_TYPE, API_RUNTIME)), + ("cudaStream_t", ("hipStream_t", CONV_TYPE, API_RUNTIME)), + ("cudaPointerAttributes", ("hipPointerAttribute_t", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceAttr", ("hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceProp", ("hipDeviceProp_t", CONV_TYPE, API_RUNTIME)), + ( + "cudaDeviceP2PAttr", + ("hipDeviceP2PAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeMode", + ("hipComputeMode", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaFuncCache", ("hipFuncCache_t", CONV_CACHE, API_RUNTIME)), + ( + "cudaFuncAttributes", + ("hipFuncAttributes", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaSharedMemConfig", ("hipSharedMemConfig", CONV_TYPE, API_RUNTIME)), + ("cudaLimit", ("hipLimit_t", CONV_TYPE, API_RUNTIME)), + ("cudaOutputMode", ("hipOutputMode", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), + ("cudaTextureReadMode", ("hipTextureReadMode", CONV_TEX, API_RUNTIME)), + ("cudaTextureFilterMode", ("hipTextureFilterMode", CONV_TEX, API_RUNTIME)), + ("cudaChannelFormatKind", ("hipChannelFormatKind", CONV_TEX, API_RUNTIME)), + ("cudaChannelFormatDesc", ("hipChannelFormatDesc", CONV_TEX, API_RUNTIME)), + ("cudaResourceDesc", ("hipResourceDesc", CONV_TEX, API_RUNTIME)), + 
("cudaResourceViewDesc", ("hipResourceViewDesc", CONV_TEX, API_RUNTIME)), + ("cudaTextureDesc", ("hipTextureDesc", CONV_TEX, API_RUNTIME)), + ( + "surfaceReference", + ("hipSurfaceReference", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaTextureObject_t", ("hipTextureObject_t", CONV_TEX, API_RUNTIME)), + ("cudaResourceType", ("hipResourceType", CONV_TEX, API_RUNTIME)), + ("cudaResourceViewFormat", ("hipResourceViewFormat", CONV_TEX, API_RUNTIME)), + ("cudaTextureAddressMode", ("hipTextureAddressMode", CONV_TEX, API_RUNTIME)), + ( + "cudaSurfaceBoundaryMode", + ("hipSurfaceBoundaryMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaSurfaceFormatMode", + ("hipSurfaceFormatMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaTextureType1D", ("hipTextureType1D", CONV_TEX, API_RUNTIME)), + ("cudaTextureType2D", ("hipTextureType2D", CONV_TEX, API_RUNTIME)), + ("cudaTextureType3D", ("hipTextureType3D", CONV_TEX, API_RUNTIME)), + ("cudaTextureTypeCubemap", ("hipTextureTypeCubemap", CONV_TEX, API_RUNTIME)), + ( + "cudaTextureType1DLayered", + ("hipTextureType1DLayered", CONV_TEX, API_RUNTIME), + ), + ( + "cudaTextureType2DLayered", + ("hipTextureType2DLayered", CONV_TEX, API_RUNTIME), + ), + ( + "cudaTextureTypeCubemapLayered", + ("hipTextureTypeCubemapLayered", CONV_TEX, API_RUNTIME), + ), + ("cudaIpcEventHandle_t", ("hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME)), + ("cudaIpcEventHandle_st", ("hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME)), + ("cudaIpcMemHandle_t", ("hipIpcMemHandle_t", CONV_TYPE, API_RUNTIME)), + ("cudaIpcMemHandle_st", ("hipIpcMemHandle_t", CONV_TYPE, API_RUNTIME)), + ( + "cudaGraphicsCubeFace", + ("hipGraphicsCubeFace", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsMapFlags", + ("hipGraphicsMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsRegisterFlags", + ("hipGraphicsRegisterFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + 
"cudaGLDeviceList", + ("hipGLDeviceList", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaGLMapFlags", ("hipGLMapFlags", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaD3D9DeviceList", + ("hipD3D9DeviceList", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9MapFlags", + ("hipD3D9MapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9RegisterFlags", + ("hipD3D9RegisterFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10DeviceList", + ("hipd3d10DeviceList", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10MapFlags", + ("hipD3D10MapFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10RegisterFlags", + ("hipD3D10RegisterFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11DeviceList", + ("hipd3d11DeviceList", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEglStreamConnection", + ("hipEglStreamConnection", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cublasHandle_t", ("hipblasHandle_t", CONV_TYPE, API_BLAS)), + ("cublasOperation_t", ("hipblasOperation_t", CONV_TYPE, API_BLAS)), + ("cublasStatus_t", ("hipblasStatus_t", CONV_TYPE, API_BLAS)), + ("cublasFillMode_t", ("hipblasFillMode_t", CONV_TYPE, API_BLAS)), + ("cublasDiagType_t", ("hipblasDiagType_t", CONV_TYPE, API_BLAS)), + ("cublasSideMode_t", ("hipblasSideMode_t", CONV_TYPE, API_BLAS)), + ("cublasPointerMode_t", ("hipblasPointerMode_t", CONV_TYPE, API_BLAS)), + ("cublasGemmAlgo_t", ("hipblasGemmAlgo_t", CONV_TYPE, API_BLAS)), + ( + "cublasAtomicsMode_t", + ("hipblasAtomicsMode_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDataType_t", + ("hipblasDatatype_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED), + ), + ("curandStatus", ("hiprandStatus_t", CONV_TYPE, API_RAND)), + ("curandStatus_t", ("hiprandStatus_t", CONV_TYPE, API_RAND)), + ("curandRngType", ("hiprandRngType_t", CONV_TYPE, API_RAND)), + ("curandRngType_t", ("hiprandRngType_t", CONV_TYPE, API_RAND)), + 
("curandGenerator_st", ("hiprandGenerator_st", CONV_TYPE, API_RAND)), + ("curandGenerator_t", ("hiprandGenerator_t", CONV_TYPE, API_RAND)), + ( + "curandDirectionVectorSet", + ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDirectionVectorSet_t", + ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ("curandOrdering", ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), + ( + "curandOrdering_t", + ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistribution_st", + ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2V_st", + ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistribution_t", + ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2V_t", + ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionShift_st", + ("hiprandDistributionShift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionShift_t", + ("hiprandDistributionShift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionM2Shift_st", + ("hiprandDistributionM2Shift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionM2Shift_t", + ("hiprandDistributionM2Shift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2_st", + ("hiprandHistogramM2_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2_t", + ("hiprandHistogramM2_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2K_st", + ("hiprandHistogramM2K_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2K_t", + ("hiprandHistogramM2K_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDiscreteDistribution_st", + ("hiprandDiscreteDistribution_st", CONV_TYPE, API_RAND), + ), + ( + "curandDiscreteDistribution_t", + ("hiprandDiscreteDistribution_t", 
CONV_TYPE, API_RAND), + ), + ("curandMethod", ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), + ("curandMethod_t", ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), + ( + "curandDirectionVectors32_t", + ("hiprandDirectionVectors32_t", CONV_TYPE, API_RAND), + ), + ( + "curandDirectionVectors64_t", + ("hiprandDirectionVectors64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ("curandStateMtgp32_t", ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND)), + ("curandStateMtgp32", ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND)), + ( + "curandStateScrambledSobol64_t", + ("hiprandStateScrambledSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandStateSobol64_t", + ("hiprandStateSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandStateScrambledSobol32_t", + ("hiprandStateScrambledSobol32_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ("curandStateSobol32_t", ("hiprandStateSobol32_t", CONV_TYPE, API_RAND)), + ("curandStateMRG32k3a_t", ("hiprandStateMRG32k3a_t", CONV_TYPE, API_RAND)), + ( + "curandStatePhilox4_32_10_t", + ("hiprandStatePhilox4_32_10_t", CONV_TYPE, API_RAND), + ), + ("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)), + ("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)), + ("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)), + ("CUuuid", ("hipUUID", CONV_TYPE, API_RUNTIME)), + ("cudaGraph_t", ("hipGraph_t", CONV_TYPE, API_RAND)), + ("cudaGraphExec_t", ("hipGraphExec_t", CONV_TYPE, API_RAND)), + ("__nv_bfloat16", ("__hip_bfloat16", CONV_TYPE, API_RUNTIME)), + ("__nv_bfloat162", ("__hip_bfloat162", CONV_TYPE, API_RUNTIME)), + ] +) + +CUDA_INCLUDE_MAP = collections.OrderedDict( + [ + # since pytorch uses "\b{pattern}\b" as the actual re pattern, + # patterns listed here have to begin and end with alnum chars + ( + "include " to differentiate + ("", ("", CONV_INCLUDE, API_RUNTIME)), + ("nvrtc.h", ("hip/hiprtc.h", CONV_INCLUDE, API_RTC)), + ("thrust/system/cuda", 
("thrust/system/hip", CONV_INCLUDE, API_BLAS)), + ("cub/util_allocator.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/block/block_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ( + "cub/block/block_raking_layout.cuh", + ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS), + ), + ("cub/cub.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/config.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/util_ptx.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/util_type.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ( + "cub/device/device_run_length_encode.cuh", + ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS), + ), + ("cub/block/block_load.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/block/block_store.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/block/block_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ( + "cub/device/device_radix_sort.cuh", + ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS), + ), + ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("cub/device/device_select.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), + ("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)), + ("nvml.h", ("rocm_smi/rocm_smi.h", CONV_INCLUDE, API_ROCMSMI)), + ] +) + +CUDA_IDENTIFIER_MAP = collections.OrderedDict( + [ + ("__CUDACC__", ("__HIPCC__", CONV_DEF, API_RUNTIME)), + ( + "CUDA_ERROR_INVALID_CONTEXT", + ("hipErrorInvalidContext", CONV_TYPE, API_DRIVER), + ), + ( + "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", + ("hipErrorContextAlreadyCurrent", CONV_TYPE, API_DRIVER), + ), + ( + "CUDA_ERROR_ARRAY_IS_MAPPED", + ("hipErrorArrayIsMapped", CONV_TYPE, API_DRIVER), + ), + ("CUDA_ERROR_ALREADY_MAPPED", ("hipErrorAlreadyMapped", CONV_TYPE, API_DRIVER)), + ( + "CUDA_ERROR_ALREADY_ACQUIRED", + ("hipErrorAlreadyAcquired", CONV_TYPE, API_DRIVER), + ), + 
("CUDA_ERROR_NOT_MAPPED", ("hipErrorNotMapped", CONV_TYPE, API_DRIVER)), + ( + "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", + ("hipErrorNotMappedAsArray", CONV_TYPE, API_DRIVER), + ), + ( + "CUDA_ERROR_NOT_MAPPED_AS_POINTER", + ("hipErrorNotMappedAsPointer", CONV_TYPE, API_DRIVER), + ), + ( + "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", + ("hipErrorContextAlreadyInUse", CONV_TYPE, API_DRIVER), + ), + ("CUDA_ERROR_INVALID_SOURCE", ("hipErrorInvalidSource", CONV_TYPE, API_DRIVER)), + ("CUDA_ERROR_FILE_NOT_FOUND", ("hipErrorFileNotFound", CONV_TYPE, API_DRIVER)), + ("CUDA_ERROR_NOT_FOUND", ("hipErrorNotFound", CONV_TYPE, API_DRIVER)), + ( + "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", + ( + "hipErrorLaunchIncompatibleTexturing", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", + ("hipErrorPrimaryContextActive", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_CONTEXT_IS_DESTROYED", + ("hipErrorContextIsDestroyed", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NOT_PERMITTED", + ("hipErrorNotPermitted", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NOT_SUPPORTED", + ("hipErrorNotSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMissingConfiguration", + ("hipErrorMissingConfiguration", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorPriorLaunchFailure", + ("hipErrorPriorLaunchFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidDeviceFunction", + ("hipErrorInvalidDeviceFunction", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidConfiguration", + ("hipErrorInvalidConfiguration", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidPitchValue", + ("hipErrorInvalidPitchValue", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidSymbol", + ("hipErrorInvalidSymbol", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidHostPointer", + 
("hipErrorInvalidHostPointer", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidDevicePointer", + ("hipErrorInvalidDevicePointer", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaErrorInvalidTexture", + ("hipErrorInvalidTexture", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidTextureBinding", + ("hipErrorInvalidTextureBinding", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidChannelDescriptor", + ( + "hipErrorInvalidChannelDescriptor", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaErrorInvalidMemcpyDirection", + ("hipErrorInvalidMemcpyDirection", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorAddressOfConstant", + ("hipErrorAddressOfConstant", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorTextureFetchFailed", + ("hipErrorTextureFetchFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorTextureNotBound", + ("hipErrorTextureNotBound", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorSynchronizationError", + ("hipErrorSynchronizationError", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidFilterSetting", + ("hipErrorInvalidFilterSetting", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidNormSetting", + ("hipErrorInvalidNormSetting", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMixedDeviceExecution", + ("hipErrorMixedDeviceExecution", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorNotYetImplemented", + ("hipErrorNotYetImplemented", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMemoryValueTooLarge", + ("hipErrorMemoryValueTooLarge", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInsufficientDriver", + ("hipErrorInsufficientDriver", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorSetOnActiveProcess", + ("hipErrorSetOnActiveProcess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + 
"cudaErrorInvalidSurface", + ("hipErrorInvalidSurface", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDuplicateVariableName", + ("hipErrorDuplicateVariableName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDuplicateTextureName", + ("hipErrorDuplicateTextureName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDuplicateSurfaceName", + ("hipErrorDuplicateSurfaceName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDevicesUnavailable", + ("hipErrorDevicesUnavailable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorIncompatibleDriverContext", + ( + "hipErrorIncompatibleDriverContext", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaErrorDeviceAlreadyInUse", + ("hipErrorDeviceAlreadyInUse", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchMaxDepthExceeded", + ("hipErrorLaunchMaxDepthExceeded", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchFileScopedTex", + ("hipErrorLaunchFileScopedTex", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchFileScopedSurf", + ("hipErrorLaunchFileScopedSurf", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorSyncDepthExceeded", + ("hipErrorSyncDepthExceeded", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchPendingCountExceeded", + ( + "hipErrorLaunchPendingCountExceeded", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaErrorNotPermitted", + ("hipErrorNotPermitted", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorNotSupported", + ("hipErrorNotSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorStartupFailure", + ("hipErrorStartupFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorApiFailureBase", + ("hipErrorApiFailureBase", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_SUCCESS", ("hipSuccess", CONV_TYPE, API_DRIVER)), + ("cudaSuccess", ("hipSuccess", 
CONV_TYPE, API_RUNTIME)), + ("CUDA_ERROR_INVALID_VALUE", ("hipErrorInvalidValue", CONV_TYPE, API_DRIVER)), + ("cudaErrorInvalidValue", ("hipErrorInvalidValue", CONV_TYPE, API_RUNTIME)), + ( + "CUDA_ERROR_OUT_OF_MEMORY", + ("hipErrorMemoryAllocation", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorMemoryAllocation", + ("hipErrorMemoryAllocation", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_NOT_INITIALIZED", + ("hipErrorNotInitialized", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInitializationError", + ("hipErrorInitializationError", CONV_TYPE, API_RUNTIME), + ), + ("CUDA_ERROR_DEINITIALIZED", ("hipErrorDeinitialized", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorCudartUnloading", + ("hipErrorDeinitialized", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_DISABLED", + ("hipErrorProfilerDisabled", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorProfilerDisabled", + ("hipErrorProfilerDisabled", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_NOT_INITIALIZED", + ("hipErrorProfilerNotInitialized", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorProfilerNotInitialized", + ("hipErrorProfilerNotInitialized", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_ALREADY_STARTED", + ("hipErrorProfilerAlreadyStarted", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorProfilerAlreadyStarted", + ("hipErrorProfilerAlreadyStarted", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_ALREADY_STOPPED", + ("hipErrorProfilerAlreadyStopped", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorProfilerAlreadyStopped", + ("hipErrorProfilerAlreadyStopped", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_ERROR_NO_DEVICE", ("hipErrorNoDevice", CONV_TYPE, API_DRIVER)), + ("cudaErrorNoDevice", ("hipErrorNoDevice", CONV_TYPE, API_RUNTIME)), + ("CUDA_ERROR_INVALID_DEVICE", ("hipErrorInvalidDevice", CONV_TYPE, API_DRIVER)), + ("cudaErrorInvalidDevice", ("hipErrorInvalidDevice", CONV_TYPE, API_RUNTIME)), + 
("CUDA_ERROR_INVALID_IMAGE", ("hipErrorInvalidImage", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorInvalidKernelImage", + ("hipErrorInvalidImage", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_ERROR_MAP_FAILED", ("hipErrorMapFailed", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorMapBufferObjectFailed", + ("hipErrorMapFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_ERROR_UNMAP_FAILED", ("hipErrorUnmapFailed", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorUnmapBufferObjectFailed", + ("hipErrorUnmapFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NO_BINARY_FOR_GPU", + ("hipErrorNoBinaryForGpu", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorNoKernelImageForDevice", + ("hipErrorNoBinaryForGpu", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_ECC_UNCORRECTABLE", + ("hipErrorECCNotCorrectable", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorECCUncorrectable", + ("hipErrorECCNotCorrectable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_UNSUPPORTED_LIMIT", + ("hipErrorUnsupportedLimit", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorUnsupportedLimit", + ("hipErrorUnsupportedLimit", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", + ("hipErrorPeerAccessUnsupported", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorPeerAccessUnsupported", + ("hipErrorPeerAccessUnsupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_PTX", + ("hipErrorInvalidKernelFile", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInvalidPtx", + ("hipErrorInvalidKernelFile", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT", + ("hipErrorInvalidGraphicsContext", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInvalidGraphicsContext", + ("hipErrorInvalidGraphicsContext", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NVLINK_UNCORRECTABLE", + ("hipErrorNvlinkUncorrectable", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + 
( + "cudaErrorNvlinkUncorrectable", + ("hipErrorNvlinkUncorrectable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", + ("hipErrorSharedObjectSymbolNotFound", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorSharedObjectSymbolNotFound", + ( + "hipErrorSharedObjectSymbolNotFound", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", + ("hipErrorSharedObjectInitFailed", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorSharedObjectInitFailed", + ("hipErrorSharedObjectInitFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_OPERATING_SYSTEM", + ("hipErrorOperatingSystem", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorOperatingSystem", + ("hipErrorOperatingSystem", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_HANDLE", + ("hipErrorInvalidResourceHandle", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInvalidResourceHandle", + ("hipErrorInvalidResourceHandle", CONV_TYPE, API_RUNTIME), + ), + ("CUDA_ERROR_NOT_READY", ("hipErrorNotReady", CONV_TYPE, API_DRIVER)), + ("cudaErrorNotReady", ("hipErrorNotReady", CONV_TYPE, API_RUNTIME)), + ( + "CUDA_ERROR_ILLEGAL_ADDRESS", + ("hipErrorIllegalAddress", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorIllegalAddress", + ("hipErrorIllegalAddress", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", + ("hipErrorLaunchOutOfResources", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorLaunchOutOfResources", + ("hipErrorLaunchOutOfResources", CONV_TYPE, API_RUNTIME), + ), + ("CUDA_ERROR_LAUNCH_TIMEOUT", ("hipErrorLaunchTimeOut", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorLaunchTimeout", + ("hipErrorLaunchTimeOut", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", + ("hipErrorPeerAccessAlreadyEnabled", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorPeerAccessAlreadyEnabled", + ("hipErrorPeerAccessAlreadyEnabled", CONV_TYPE, 
API_RUNTIME), + ), + ( + "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", + ("hipErrorPeerAccessNotEnabled", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorPeerAccessNotEnabled", + ("hipErrorPeerAccessNotEnabled", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_ASSERT", + ("hipErrorAssert", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorAssert", + ("hipErrorAssert", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_TOO_MANY_PEERS", + ("hipErrorTooManyPeers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorTooManyPeers", + ("hipErrorTooManyPeers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", + ("hipErrorHostMemoryAlreadyRegistered", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorHostMemoryAlreadyRegistered", + ("hipErrorHostMemoryAlreadyRegistered", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", + ("hipErrorHostMemoryNotRegistered", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorHostMemoryNotRegistered", + ("hipErrorHostMemoryNotRegistered", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_HARDWARE_STACK_ERROR", + ("hipErrorHardwareStackError", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorHardwareStackError", + ("hipErrorHardwareStackError", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_ILLEGAL_INSTRUCTION", + ("hipErrorIllegalInstruction", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorIllegalInstruction", + ("hipErrorIllegalInstruction", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_MISALIGNED_ADDRESS", + ("hipErrorMisalignedAddress", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMisalignedAddress", + ("hipErrorMisalignedAddress", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_ADDRESS_SPACE", + ("hipErrorInvalidAddressSpace", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidAddressSpace", + ("hipErrorInvalidAddressSpace", 
CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_PC", + ("hipErrorInvalidPc", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidPc", + ("hipErrorInvalidPc", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_LAUNCH_FAILED", + ("hipErrorLaunchFailure", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchFailure", + ("hipErrorLaunchFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_UNKNOWN", + ("hipErrorUnknown", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cudaErrorUnknown", ("hipErrorUnknown", CONV_TYPE, API_RUNTIME)), + ( + "CU_TR_ADDRESS_MODE_WRAP", + ("HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TR_ADDRESS_MODE_CLAMP", + ("HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TR_ADDRESS_MODE_MIRROR", + ("HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TR_ADDRESS_MODE_BORDER", + ("HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_POSITIVE_X", + ("HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_NEGATIVE_X", + ("HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_POSITIVE_Y", + ("HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_NEGATIVE_Y", + ("HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_POSITIVE_Z", + ("HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_NEGATIVE_Z", + ("HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_AD_FORMAT_UNSIGNED_INT8", + ("HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_UNSIGNED_INT16", + ("HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER), + ), + ( + 
"CU_AD_FORMAT_UNSIGNED_INT32", + ("HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_SIGNED_INT8", + ("HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_SIGNED_INT16", + ("HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_SIGNED_INT32", + ("HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER), + ), + ("CU_AD_FORMAT_HALF", ("HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER)), + ("CU_AD_FORMAT_FLOAT", ("HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER)), + ( + "CU_COMPUTEMODE_DEFAULT", + ("hipComputeModeDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_COMPUTEMODE_EXCLUSIVE", + ("hipComputeModeExclusive", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_COMPUTEMODE_PROHIBITED", + ("hipComputeModeProhibited", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_COMPUTEMODE_EXCLUSIVE_PROCESS", + ("hipComputeModeExclusiveProcess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_SET_READ_MOSTLY", + ("hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + ("hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + ( + "hipMemAdviseSetPreferredLocation", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + ( + "hipMemAdviseUnsetPreferredLocation", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_MEM_ADVISE_SET_ACCESSED_BY", + ("hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_UNSET_ACCESSED_BY", + ("hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY", + ("hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION", + ( + "hipMemRangeAttributePreferredLocation", + CONV_TYPE, + 
API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY", + ("hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION", + ( + "hipMemRangeAttributeLastPrefetchLocation", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_CTX_SCHED_AUTO", + ("HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_SPIN", + ("HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_YIELD", + ("HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_BLOCKING_SYNC", + ("HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_BLOCKING_SYNC", + ("HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_MASK", + ("HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_MAP_HOST", + ("HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_LMEM_RESIZE_TO_MAX", + ("HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_FLAGS_MASK", + ("HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LAUNCH_PARAM_BUFFER_POINTER", + ("HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_TYPE, API_DRIVER), + ), + ( + "CU_LAUNCH_PARAM_BUFFER_SIZE", + ("HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_TYPE, API_DRIVER), + ), + ("CU_LAUNCH_PARAM_END", ("HIP_LAUNCH_PARAM_END", CONV_TYPE, API_DRIVER)), + ( + "CU_IPC_HANDLE_SIZE", + ("HIP_IPC_HANDLE_SIZE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTALLOC_DEVICEMAP", + ("HIP_MEMHOSTALLOC_DEVICEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTALLOC_PORTABLE", + ("HIP_MEMHOSTALLOC_PORTABLE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTALLOC_WRITECOMBINED", + ("HIP_MEMHOSTALLOC_WRITECOMBINED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"CU_MEMHOSTREGISTER_DEVICEMAP", + ("HIP_MEMHOSTREGISTER_DEVICEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTREGISTER_IOMEMORY", + ("HIP_MEMHOSTREGISTER_IOMEMORY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTREGISTER_PORTABLE", + ("HIP_MEMHOSTREGISTER_PORTABLE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_PARAM_TR_DEFAULT", + ("HIP_PARAM_TR_DEFAULT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_LEGACY", + ("HIP_STREAM_LEGACY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_PER_THREAD", + ("HIP_STREAM_PER_THREAD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TRSA_OVERRIDE_FORMAT", + ("HIP_TRSA_OVERRIDE_FORMAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TRSF_NORMALIZED_COORDINATES", + ("HIP_TRSF_NORMALIZED_COORDINATES", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TRSF_READ_AS_INTEGER", + ("HIP_TRSF_READ_AS_INTEGER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_TRSF_SRGB", ("HIP_TRSF_SRGB", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUDA_ARRAY3D_2DARRAY", + ("HIP_ARRAY3D_LAYERED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_CUBEMAP", + ("HIP_ARRAY3D_CUBEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_DEPTH_TEXTURE", + ("HIP_ARRAY3D_DEPTH_TEXTURE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_LAYERED", + ("HIP_ARRAY3D_LAYERED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_SURFACE_LDST", + ("HIP_ARRAY3D_SURFACE_LDST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_TEXTURE_GATHER", + ("HIP_ARRAY3D_TEXTURE_GATHER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK", + ( + "hipDeviceAttributeMaxThreadsPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X", + ("hipDeviceAttributeMaxBlockDimX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), 
+ ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y", + ("hipDeviceAttributeMaxBlockDimY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z", + ("hipDeviceAttributeMaxBlockDimZ", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X", + ("hipDeviceAttributeMaxGridDimX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y", + ("hipDeviceAttributeMaxGridDimY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z", + ("hipDeviceAttributeMaxGridDimZ", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK", + ( + "hipDeviceAttributeMaxSharedMemoryPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK", + ( + "hipDeviceAttributeMaxSharedMemoryPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY", + ( + "hipDeviceAttributeTotalConstantMemory", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_WARP_SIZE", + ("hipDeviceAttributeWarpSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_PITCH", + ("hipDeviceAttributeMaxPitch", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK", + ( + "hipDeviceAttributeMaxRegistersPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK", + ( + "hipDeviceAttributeMaxRegistersPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CLOCK_RATE", + ("hipDeviceAttributeClockRate", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT", + ( + "hipDeviceAttributeTextureAlignment", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_GPU_OVERLAP", + ( + 
"hipDeviceAttributeAsyncEngineCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT", + ( + "hipDeviceAttributeMultiprocessorCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT", + ( + "hipDeviceAttributeKernelExecTimeout", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_INTEGRATED", + ("hipDeviceAttributeIntegrated", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY", + ( + "hipDeviceAttributeCanMapHostMemory", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_MODE", + ("hipDeviceAttributeComputeMode", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH", + ( + "hipDeviceAttributeMaxTexture3DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT", + ( + "hipDeviceAttributeMaxTexture3DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH", + ( + "hipDeviceAttributeMaxTexture3DDepth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DLayeredHeight", + CONV_TYPE, 
+ API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxTexture2DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DLayeredHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES", + ( + "hipDeviceAttributeMaxTexture2DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT", + ( + "hipDeviceAttributeSurfaceAlignment", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS", + ("hipDeviceAttributeConcurrentKernels", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_ECC_ENABLED", + ("hipDeviceAttributeEccEnabled", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_PCI_BUS_ID", + ("hipDeviceAttributePciBusId", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID", + ("hipDeviceAttributePciDeviceId", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_TCC_DRIVER", + ("hipDeviceAttributeTccDriver", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE", + ( + "hipDeviceAttributeMemoryClockRate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH", + ("hipDeviceAttributeMemoryBusWidth", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE", + ("hipDeviceAttributeL2CacheSize", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR", + ("hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_TYPE, API_DRIVER), + ), + ( + 
"CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT", + ( + "hipDeviceAttributeAsyncEngineCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING", + ( + "hipDeviceAttributeUnifiedAddressing", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxTexture1DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER", + ( + "hipDeviceAttributeCanTex2DGather", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DGatherWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DGatherHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE", + ( + "hipDeviceAttributeMaxTexture3DWidthAlternate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE", + ( + "hipDeviceAttributeMaxTexture3DHeightAlternate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE", + ( + "hipDeviceAttributeMaxTexture3DDepthAlternate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID", + ("hipDeviceAttributePciDomainId", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT", + ( + "hipDeviceAttributeTexturePitchAlignment", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH", + ( + 
"hipDeviceAttributeMaxTextureCubemapWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxTextureCubemapLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxTextureCubemapLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH", + ( + "hipDeviceAttributeMaxSurface1DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH", + ( + "hipDeviceAttributeMaxSurface2DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT", + ( + "hipDeviceAttributeMaxSurface2DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH", + ( + "hipDeviceAttributeMaxSurface3DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT", + ( + "hipDeviceAttributeMaxSurface3DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH", + ( + "hipDeviceAttributeMaxSurface3DDepth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxSurface1DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxSurface1DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxSurface2DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT", + ( + 
"hipDeviceAttributeMaxSurface2DLayeredHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxSurface2DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH", + ( + "hipDeviceAttributeMaxSurfaceCubemapWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DLinearWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DLinearWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DLinearHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH", + ( + "hipDeviceAttributeMaxTexture2DLinearPitch", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DMipmappedWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DMipmappedHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR", + ("hipDeviceAttributeComputeCapabilityMajor", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR", 
+ ("hipDeviceAttributeComputeCapabilityMinor", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DMipmappedWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED", + ( + "hipDeviceAttributeStreamPrioritiesSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED", + ( + "hipDeviceAttributeGlobalL1CacheSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED", + ( + "hipDeviceAttributeLocalL1CacheSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR", + ( + "hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", + CONV_TYPE, + API_DRIVER, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR", + ( + "hipDeviceAttributeMaxRegistersPerMultiprocessor", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY", + ("hipDeviceAttributeManagedMemory", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD", + ("hipDeviceAttributeIsMultiGpuBoard", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID", + ( + "hipDeviceAttributeMultiGpuBoardGroupId", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED", + ( + "hipDeviceAttributeHostNativeAtomicSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO", + ( + "hipDeviceAttributeSingleToDoublePrecisionPerfRatio", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS", + ( + "hipDeviceAttributePageableMemoryAccess", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + 
"CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS", + ( + "hipDeviceAttributeConcurrentManagedAccess", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED", + ( + "hipDeviceAttributeComputePreemptionSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM", + ( + "hipDeviceAttributeCanUseHostPointerForRegisteredMem", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX", + ("hipDeviceAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_CONTEXT", + ("hipPointerAttributeContext", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_MEMORY_TYPE", + ("hipPointerAttributeMemoryType", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_DEVICE_POINTER", + ( + "hipPointerAttributeDevicePointer", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_POINTER_ATTRIBUTE_HOST_POINTER", + ("hipPointerAttributeHostPointer", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_P2P_TOKENS", + ("hipPointerAttributeP2pTokens", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_SYNC_MEMOPS", + ("hipPointerAttributeSyncMemops", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_BUFFER_ID", + ("hipPointerAttributeBufferId", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_IS_MANAGED", + ("hipPointerAttributeIsManaged", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK", + ( + "hipFuncAttributeMaxThreadsPerBlocks", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES", + ("hipFuncAttributeSharedSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES", + ("hipFuncAttributeMaxDynamicSharedMemorySize", 
CONV_TYPE, API_RUNTIME), + ), + ( + "CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES", + ("hipFuncAttributeConstSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES", + ("hipFuncAttributeLocalSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_NUM_REGS", + ("hipFuncAttributeNumRegs", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_PTX_VERSION", + ("hipFuncAttributePtxVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_BINARY_VERSION", + ("hipFuncAttributeBinaryVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_CACHE_MODE_CA", + ("hipFuncAttributeCacheModeCA", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_MAX", + ("hipFuncAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE", + ("hipGraphicsMapFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY", + ("hipGraphicsMapFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + ("hipGraphicsMapFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_NONE", + ("hipGraphicsRegisterFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY", + ( + "hipGraphicsRegisterFlagsReadOnly", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD", + ( + "hipGraphicsRegisterFlagsWriteDiscard", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST", + ( + "hipGraphicsRegisterFlagsSurfaceLoadStore", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER", + ( + "hipGraphicsRegisterFlagsTextureGather", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + 
"CU_OCCUPANCY_DEFAULT", + ("hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE", + ( + "hipOccupancyDisableCachingOverride", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_FUNC_CACHE_PREFER_NONE", + ("hipFuncCachePreferNone", CONV_CACHE, API_DRIVER), + ), + ( + "CU_FUNC_CACHE_PREFER_SHARED", + ("hipFuncCachePreferShared", CONV_CACHE, API_DRIVER), + ), + ("CU_FUNC_CACHE_PREFER_L1", ("hipFuncCachePreferL1", CONV_CACHE, API_DRIVER)), + ( + "CU_FUNC_CACHE_PREFER_EQUAL", + ("hipFuncCachePreferEqual", CONV_CACHE, API_DRIVER), + ), + ( + "CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS", + ("hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUDA_IPC_HANDLE_SIZE", ("HIP_IPC_HANDLE_SIZE", CONV_TYPE, API_DRIVER)), + ( + "CU_JIT_CACHE_OPTION_NONE", + ("hipJitCacheModeOptionNone", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_CACHE_OPTION_CG", + ("hipJitCacheModeOptionCG", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_CACHE_OPTION_CA", + ("hipJitCacheModeOptionCA", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_PREFER_PTX", + ("hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_PREFER_BINARY", + ("hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_JIT_MAX_REGISTERS", ("hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_THREADS_PER_BLOCK", + ("hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_WALL_TIME", ("hipJitOptionWallTime", CONV_JIT, API_DRIVER)), + ("CU_JIT_INFO_LOG_BUFFER", ("hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES", + ("hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_ERROR_LOG_BUFFER", + ("hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES", + ("hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER), + ), + ( + 
"CU_JIT_OPTIMIZATION_LEVEL", + ("hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_TARGET_FROM_CUCONTEXT", + ("hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_TARGET", ("hipJitOptionTarget", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_FALLBACK_STRATEGY", + ("hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_GENERATE_DEBUG_INFO", + ("hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_LOG_VERBOSE", ("hipJitOptionLogVerbose", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_GENERATE_LINE_INFO", + ("hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_CACHE_MODE", ("hipJitOptionCacheMode", CONV_JIT, API_DRIVER)), + ("CU_JIT_NEW_SM3X_OPT", ("hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER)), + ("CU_JIT_FAST_COMPILE", ("hipJitOptionFastCompile", CONV_JIT, API_DRIVER)), + ("CU_JIT_NUM_OPTIONS", ("hipJitOptionNumOptions", CONV_JIT, API_DRIVER)), + ( + "CU_TARGET_COMPUTE_10", + ("hipJitTargetCompute10", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_11", + ("hipJitTargetCompute11", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_12", + ("hipJitTargetCompute12", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_13", + ("hipJitTargetCompute13", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_20", + ("hipJitTargetCompute20", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_21", + ("hipJitTargetCompute21", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_30", + ("hipJitTargetCompute30", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_32", + ("hipJitTargetCompute32", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_35", + ("hipJitTargetCompute35", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_37", + ("hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_50", + 
("hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_52", + ("hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_53", + ("hipJitTargetCompute53", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_60", + ("hipJitTargetCompute60", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_61", + ("hipJitTargetCompute61", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_62", + ("hipJitTargetCompute62", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_CUBIN", + ("hipJitInputTypeBin", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_PTX", + ("hipJitInputTypePtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_FATBINARY", + ("hipJitInputTypeFatBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_OBJECT", + ("hipJitInputTypeObject", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_LIBRARY", + ("hipJitInputTypeLibrary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_NUM_INPUT_TYPES", + ("hipJitInputTypeNumInputTypes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_STACK_SIZE", + ("hipLimitStackSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_PRINTF_FIFO_SIZE", + ("hipLimitPrintfFifoSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_MALLOC_HEAP_SIZE", + ("hipLimitMallocHeapSize", CONV_TYPE, API_DRIVER), + ), + ( + "CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH", + ("hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT", + ( + "hipLimitDevRuntimePendingLaunchCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_LIMIT_STACK_SIZE", + ("hipLimitStackSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ATTACH_GLOBAL", + ("hipMemAttachGlobal", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"CU_MEM_ATTACH_HOST", + ("hipMemAttachHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ATTACH_SINGLE", + ("hipMemAttachSingle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_HOST", + ("hipMemTypeHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_DEVICE", + ("hipMemTypeDevice", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_ARRAY", + ("hipMemTypeArray", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_UNIFIED", + ("hipMemTypeUnified", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_ARRAY", + ("hipResourceTypeArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_MIPMAPPED_ARRAY", + ("hipResourceTypeMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_LINEAR", + ("hipResourceTypeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_PITCH2D", + ("hipResourceTypePitch2D", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_RES_VIEW_FORMAT_NONE", ("hipResViewFormatNone", CONV_TEX, API_DRIVER)), + ( + "CU_RES_VIEW_FORMAT_UINT_1X8", + ("hipResViewFormatUnsignedChar1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_2X8", + ("hipResViewFormatUnsignedChar2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_4X8", + ("hipResViewFormatUnsignedChar4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_1X8", + ("hipResViewFormatSignedChar1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_2X8", + ("hipResViewFormatSignedChar2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_4X8", + ("hipResViewFormatSignedChar4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_1X16", + ("hipResViewFormatUnsignedShort1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_2X16", + ("hipResViewFormatUnsignedShort2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_4X16", + ("hipResViewFormatUnsignedShort4", CONV_TEX, 
API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_1X16", + ("hipResViewFormatSignedShort1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_2X16", + ("hipResViewFormatSignedShort2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_4X16", + ("hipResViewFormatSignedShort4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_1X32", + ("hipResViewFormatUnsignedInt1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_2X32", + ("hipResViewFormatUnsignedInt2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_4X32", + ("hipResViewFormatUnsignedInt4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_1X32", + ("hipResViewFormatSignedInt1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_2X32", + ("hipResViewFormatSignedInt2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_4X32", + ("hipResViewFormatSignedInt4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_1X16", + ("hipResViewFormatHalf1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_2X16", + ("hipResViewFormatHalf2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_4X16", + ("hipResViewFormatHalf4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_1X32", + ("hipResViewFormatFloat1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_2X32", + ("hipResViewFormatFloat2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_4X32", + ("hipResViewFormatFloat4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC1", + ("hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC2", + ("hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC3", + ("hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC4", + ("hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SIGNED_BC4", + 
("hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC5", + ("hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SIGNED_BC5", + ("hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC6H", + ("hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SIGNED_BC6H", + ("hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC7", + ("hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER), + ), + ( + "CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE", + ("hipSharedMemBankSizeDefault", CONV_TYPE, API_DRIVER), + ), + ( + "CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE", + ("hipSharedMemBankSizeFourByte", CONV_TYPE, API_DRIVER), + ), + ( + "CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE", + ("hipSharedMemBankSizeEightByte", CONV_TYPE, API_DRIVER), + ), + ("CU_STREAM_DEFAULT", ("hipStreamDefault", CONV_TYPE, API_DRIVER)), + ("CU_STREAM_NON_BLOCKING", ("hipStreamNonBlocking", CONV_TYPE, API_DRIVER)), + ( + "CU_STREAM_WAIT_VALUE_GEQ", + ("hipStreamWaitValueGeq", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WAIT_VALUE_EQ", + ("hipStreamWaitValueEq", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WAIT_VALUE_AND", + ("hipStreamWaitValueAnd", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WAIT_VALUE_FLUSH", + ("hipStreamWaitValueFlush", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WRITE_VALUE_DEFAULT", + ("hipStreamWriteValueDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER", + ( + "hipStreamWriteValueNoMemoryBarrier", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_STREAM_MEM_OP_WAIT_VALUE_32", + ("hipStreamBatchMemOpWaitValue32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"CU_STREAM_MEM_OP_WRITE_VALUE_32", + ("hipStreamBatchMemOpWriteValue32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES", + ( + "hipStreamBatchMemOpFlushRemoteWrites", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGetErrorName", + ("hipGetErrorName", CONV_ERROR, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGetErrorString", + ("hipDrvGetErrorString", CONV_ERROR, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuInit", ("hipInit", CONV_INIT, API_DRIVER)), + ("cuDriverGetVersion", ("hipDriverGetVersion", CONV_VERSION, API_DRIVER)), + ("cuCtxCreate", ("hipCtxCreate", CONV_CONTEXT, API_DRIVER)), + ("cuCtxCreate_v2", ("hipCtxCreate", CONV_CONTEXT, API_DRIVER)), + ("cuCtxDestroy", ("hipCtxDestroy", CONV_CONTEXT, API_DRIVER)), + ("cuCtxDestroy_v2", ("hipCtxDestroy", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetApiVersion", ("hipCtxGetApiVersion", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetCacheConfig", ("hipCtxGetCacheConfig", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetCurrent", ("hipCtxGetCurrent", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetDevice", ("hipCtxGetDevice", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetFlags", ("hipCtxGetFlags", CONV_CONTEXT, API_DRIVER)), + ("cuDeviceGetUuid", ("hipDeviceGetUuid", CONV_CONTEXT, API_DRIVER)), + ( + "cuCtxGetLimit", + ("hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuCtxGetSharedMemConfig", + ("hipCtxGetSharedMemConfig", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuCtxGetStreamPriorityRange", + ("hipCtxGetStreamPriorityRange", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuCtxPopCurrent_v2", ("hipCtxPopCurrent", CONV_CONTEXT, API_DRIVER)), + ("cuCtxPushCurrent_v2", ("hipCtxPushCurrent", CONV_CONTEXT, API_DRIVER)), + ("cuCtxSetCacheConfig", ("hipCtxSetCacheConfig", CONV_CONTEXT, API_DRIVER)), + ("cuCtxSetCurrent", ("hipCtxSetCurrent", CONV_CONTEXT, API_DRIVER)), + ( + "cuCtxSetLimit", + ("hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"cuCtxSetSharedMemConfig", + ("hipCtxSetSharedMemConfig", CONV_CONTEXT, API_DRIVER), + ), + ("cuCtxSynchronize", ("hipCtxSynchronize", CONV_CONTEXT, API_DRIVER)), + ("cuCtxAttach", ("hipCtxAttach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), + ("cuCtxDetach", ("hipCtxDetach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), + ("cuCtxEnablePeerAccess", ("hipCtxEnablePeerAccess", CONV_PEER, API_DRIVER)), + ("cuCtxDisablePeerAccess", ("hipCtxDisablePeerAccess", CONV_PEER, API_DRIVER)), + ("cuDeviceCanAccessPeer", ("hipDeviceCanAccessPeer", CONV_PEER, API_DRIVER)), + ( + "cuDeviceGetP2PAttribute", + ("hipDeviceGetP2PAttribute", CONV_PEER, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuDevicePrimaryCtxGetState", + ("hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxRelease", + ("hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxReset", + ("hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxRetain", + ("hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxSetFlags", + ("hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER), + ), + ("cuDeviceGet", ("hipDeviceGet", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetName", ("hipDeviceGetName", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetCount", ("hipGetDeviceCount", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetAttribute", ("hipDeviceGetAttribute", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetPCIBusId", ("hipDeviceGetPCIBusId", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetByPCIBusId", ("hipDeviceGetByPCIBusId", CONV_DEVICE, API_DRIVER)), + ("cuDeviceTotalMem_v2", ("hipDeviceTotalMem", CONV_DEVICE, API_DRIVER)), + ( + "cuDeviceComputeCapability", + ("hipDeviceComputeCapability", CONV_DEVICE, API_DRIVER), + ), + ("cuDeviceGetProperties", ("hipGetDeviceProperties", CONV_DEVICE, API_DRIVER)), + ("cuLinkAddData", ("hipLinkAddData", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuLinkAddFile", ("hipLinkAddFile", 
CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuLinkComplete", + ("hipLinkComplete", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuLinkCreate", ("hipLinkCreate", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuLinkDestroy", ("hipLinkDestroy", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuModuleGetFunction", ("hipModuleGetFunction", CONV_MODULE, API_DRIVER)), + ("cuModuleGetGlobal_v2", ("hipModuleGetGlobal", CONV_MODULE, API_DRIVER)), + ( + "cuModuleGetSurfRef", + ("hipModuleGetSurfRef", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuModuleGetTexRef", ("hipModuleGetTexRef", CONV_MODULE, API_DRIVER)), + ("cuModuleLoad", ("hipModuleLoad", CONV_MODULE, API_DRIVER)), + ("cuModuleLoadData", ("hipModuleLoadData", CONV_MODULE, API_DRIVER)), + ("cuModuleLoadDataEx", ("hipModuleLoadDataEx", CONV_MODULE, API_DRIVER)), + ( + "cuModuleLoadFatBinary", + ("hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuModuleUnload", ("hipModuleUnload", CONV_MODULE, API_DRIVER)), + ( + "CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK", + ( + "hipDeviceP2PAttributePerformanceRank", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED", + ( + "hipDeviceP2PAttributeAccessSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED", + ( + "hipDeviceP2PAttributeNativeAtomicSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("CU_EVENT_DEFAULT", ("hipEventDefault", CONV_EVENT, API_DRIVER)), + ("CU_EVENT_BLOCKING_SYNC", ("hipEventBlockingSync", CONV_EVENT, API_DRIVER)), + ("CU_EVENT_DISABLE_TIMING", ("hipEventDisableTiming", CONV_EVENT, API_DRIVER)), + ("CU_EVENT_INTERPROCESS", ("hipEventInterprocess", CONV_EVENT, API_DRIVER)), + ("cuEventCreate", ("hipEventCreate", CONV_EVENT, API_DRIVER)), + ("cuEventDestroy", ("hipEventDestroy", CONV_EVENT, API_DRIVER)), + ("cuEventDestroy_v2", ("hipEventDestroy", CONV_EVENT, 
API_DRIVER)), + ("cuEventElapsedTime", ("hipEventElapsedTime", CONV_EVENT, API_DRIVER)), + ("cuEventQuery", ("hipEventQuery", CONV_EVENT, API_DRIVER)), + ("cuEventRecord", ("hipEventRecord", CONV_EVENT, API_DRIVER)), + ("cuEventSynchronize", ("hipEventSynchronize", CONV_EVENT, API_DRIVER)), + ("cuFuncSetAttribute", ("hipFuncSetAttribute", CONV_EVENT, API_DRIVER)), + ( + "cuFuncGetAttribute", + ("hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuFuncSetCacheConfig", ("hipFuncSetCacheConfig", CONV_MODULE, API_DRIVER)), + ( + "cuFuncSetSharedMemConfig", + ("hipFuncSetSharedMemConfig", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuLaunchKernel", ("hipModuleLaunchKernel", CONV_MODULE, API_DRIVER)), + ( + "cuFuncSetBlockShape", + ("hipFuncSetBlockShape", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuFuncSetSharedSize", + ("hipFuncSetSharedSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuLaunch", ("hipLaunch", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuLaunchGrid", ("hipLaunchGrid", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuLaunchGridAsync", + ("hipLaunchGridAsync", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuParamSetf", ("hipParamSetf", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuParamSeti", ("hipParamSeti", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuParamSetSize", + ("hipParamSetSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuParamSetSize", + ("hipParamSetSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuParamSetv", ("hipParamSetv", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuOccupancyMaxActiveBlocksPerMultiprocessor", + ( + "hipModuleOccupancyMaxActiveBlocksPerMultiprocessor", + CONV_OCCUPANCY, + API_DRIVER, + ), + ), + ( + "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + ( + "hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + CONV_OCCUPANCY, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + 
"cuOccupancyMaxPotentialBlockSize", + ("hipModuleOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_DRIVER), + ), + ( + "cuOccupancyMaxPotentialBlockSizeWithFlags", + ( + "hipModuleOccupancyMaxPotentialBlockSizeWithFlags", + CONV_OCCUPANCY, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("cuStreamAddCallback", ("hipStreamAddCallback", CONV_STREAM, API_DRIVER)), + ( + "cuStreamAttachMemAsync", + ("hipStreamAttachMemAsync", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamCreate", + ("hipStreamCreate__", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamCreateWithPriority", + ("hipStreamCreateWithPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuStreamDestroy", ("hipStreamDestroy", CONV_STREAM, API_DRIVER)), + ("cuStreamDestroy_v2", ("hipStreamDestroy", CONV_STREAM, API_DRIVER)), + ("cuStreamGetFlags", ("hipStreamGetFlags", CONV_STREAM, API_DRIVER)), + ( + "cuStreamGetPriority", + ("hipStreamGetPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuStreamQuery", ("hipStreamQuery", CONV_STREAM, API_DRIVER)), + ("cuStreamSynchronize", ("hipStreamSynchronize", CONV_STREAM, API_DRIVER)), + ("cuStreamWaitEvent", ("hipStreamWaitEvent", CONV_STREAM, API_DRIVER)), + ( + "cuStreamWaitValue32", + ("hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamWriteValue32", + ("hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamBatchMemOp", + ("hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuArray3DCreate", ("hipArray3DCreate", CONV_MEM, API_DRIVER)), + ( + "cuArray3DGetDescriptor", + ("hipArray3DGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuArrayCreate", ("hipArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuArrayDestroy", ("hipArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuArrayGetDescriptor", + ("hipArrayGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"cuIpcCloseMemHandle", + ("hipIpcCloseMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcGetEventHandle", + ("hipIpcGetEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcGetMemHandle", + ("hipIpcGetMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcOpenEventHandle", + ("hipIpcOpenEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcOpenMemHandle", + ("hipIpcOpenMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemAlloc_v2", ("hipMalloc", CONV_MEM, API_DRIVER)), + ("cuMemAllocHost", ("hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemAllocManaged", + ("hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemAllocPitch", + ("hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpy", ("hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpy2D", ("hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpy2DAsync", + ("hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemcpy2DUnaligned", + ("hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpy3D", ("hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpy3DAsync", + ("hipMemcpy3DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemcpy3DPeer", + ("hipMemcpy3DPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemcpy3DPeerAsync", + ("hipMemcpy3DPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyAsync", ("hipMemcpyAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyAtoA", ("hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyAtoD", ("hipMemcpyAtoD", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyAtoH", ("hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpyAtoHAsync", + ("hipMemcpyAtoHAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyDtoA", ("hipMemcpyDtoA", CONV_MEM, 
API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyDtoD_v2", ("hipMemcpyDtoD", CONV_MEM, API_DRIVER)), + ("cuMemcpyDtoDAsync_v2", ("hipMemcpyDtoDAsync", CONV_MEM, API_DRIVER)), + ("cuMemcpyDtoH_v2", ("hipMemcpyDtoH", CONV_MEM, API_DRIVER)), + ("cuMemcpyDtoHAsync_v2", ("hipMemcpyDtoHAsync", CONV_MEM, API_DRIVER)), + ("cuMemcpyHtoA", ("hipMemcpyHtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpyHtoAAsync", + ("hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyHtoD_v2", ("hipMemcpyHtoD", CONV_MEM, API_DRIVER)), + ("cuMemcpyHtoDAsync_v2", ("hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER)), + ( + "cuMemcpyPeerAsync", + ("hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyPeer", ("hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemFree", ("hipFree", CONV_MEM, API_DRIVER)), + ("cuMemFree_v2", ("hipFree", CONV_MEM, API_DRIVER)), + ("cuMemFreeHost", ("hipHostFree", CONV_MEM, API_DRIVER)), + ( + "cuMemGetAddressRange", + ("hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemGetInfo_v2", ("hipMemGetInfo", CONV_MEM, API_DRIVER)), + ("cuMemHostAlloc", ("hipHostMalloc", CONV_MEM, API_DRIVER)), + ( + "cuMemHostGetDevicePointer", + ("hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemHostGetFlags", + ("hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemHostRegister_v2", ("hipHostRegister", CONV_MEM, API_DRIVER)), + ("cuMemHostUnregister", ("hipHostUnregister", CONV_MEM, API_DRIVER)), + ("cuMemsetD16_v2", ("hipMemsetD16", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD16Async", + ("hipMemsetD16Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD2D16_v2", ("hipMemsetD2D16", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD2D16Async", + ("hipMemsetD2D16Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD2D32_v2", ("hipMemsetD2D32", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + 
"cuMemsetD2D32Async", + ("hipMemsetD2D32Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD2D8_v2", ("hipMemsetD2D8", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD2D8Async", + ("hipMemsetD2D8Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD32_v2", ("hipMemset", CONV_MEM, API_DRIVER)), + ("cuMemsetD32Async", ("hipMemsetAsync", CONV_MEM, API_DRIVER)), + ("cuMemsetD8_v2", ("hipMemsetD8", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD8Async", + ("hipMemsetD8Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMipmappedArrayCreate", + ("hipMipmappedArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMipmappedArrayDestroy", + ("hipMipmappedArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMipmappedArrayGetLevel", + ("hipMipmappedArrayGetLevel", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemPrefetchAsync", + ("hipMemPrefetchAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemAdvise", ("hipMemAdvise", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemRangeGetAttribute", + ("hipMemRangeGetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemRangeGetAttributes", + ("hipMemRangeGetAttributes", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuPointerGetAttribute", + ("hipPointerGetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemGetAddressRange_v2", + ("hipMemGetAddressRange", CONV_MEM, API_DRIVER), + ), + ( + "cuPointerGetAttributes", + ("hipPointerGetAttributes", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuPointerSetAttribute", + ("hipPointerSetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_TR_FILTER_MODE_POINT", ("hipFilterModePoint", CONV_TEX, API_DRIVER)), + ( + "CU_TR_FILTER_MODE_LINEAR", + ("hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetAddress", + ("hipTexRefGetAddress", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"cuTexRefGetAddressMode", + ("hipTexRefGetAddressMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetArray", + ("hipTexRefGetArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetBorderColor", + ("hipTexRefGetBorderColor", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetFilterMode", + ("hipTexRefGetFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetFlags", + ("hipTexRefGetFlags", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetFormat", + ("hipTexRefGetFormat", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMaxAnisotropy", + ("hipTexRefGetMaxAnisotropy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMipmapFilterMode", + ("hipTexRefGetMipmapFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMipmapLevelBias", + ("hipTexRefGetMipmapLevelBias", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMipmapLevelClamp", + ("hipTexRefGetMipmapLevelClamp", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMipmappedArray", + ("hipTexRefGetMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetAddress", + ("hipTexRefSetAddress", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetAddress2D", + ("hipTexRefSetAddress2D", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuTexRefSetAddressMode", ("hipTexRefSetAddressMode", CONV_TEX, API_DRIVER)), + ("cuTexRefSetArray", ("hipTexRefSetArray", CONV_TEX, API_DRIVER)), + ( + "cuTexRefSetBorderColor", + ("hipTexRefSetBorderColor", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuTexRefSetFilterMode", ("hipTexRefSetFilterMode", CONV_TEX, API_DRIVER)), + ("cuTexRefSetFlags", ("hipTexRefSetFlags", CONV_TEX, API_DRIVER)), + ("cuTexRefSetFormat", ("hipTexRefSetFormat", CONV_TEX, API_DRIVER)), + ( + "cuTexRefSetMaxAnisotropy", + ("hipTexRefSetMaxAnisotropy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmapFilterMode", 
+ ("hipTexRefSetMipmapFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmapLevelBias", + ("hipTexRefSetMipmapLevelBias", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmapLevelClamp", + ("hipTexRefSetMipmapLevelClamp", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmappedArray", + ("hipTexRefSetMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuTexRefCreate", ("hipTexRefCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuTexRefDestroy", + ("hipTexRefDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfRefGetArray", + ("hipSurfRefGetArray", CONV_SURFACE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfRefSetArray", + ("hipSurfRefSetArray", CONV_SURFACE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectCreate", + ("hipTexObjectCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectDestroy", + ("hipTexObjectDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectGetResourceDesc", + ("hipTexObjectGetResourceDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectGetResourceViewDesc", + ("hipTexObjectGetResourceViewDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectGetTextureDesc", + ("hipTexObjectGetTextureDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfObjectCreate", + ("hipSurfObjectCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfObjectDestroy", + ("hipSurfObjectDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfObjectGetResourceDesc", + ("hipSurfObjectGetResourceDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsMapResources", + ("hipGraphicsMapResources", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsResourceGetMappedMipmappedArray", + ( + "hipGraphicsResourceGetMappedMipmappedArray", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsResourceGetMappedPointer", + ( + 
"hipGraphicsResourceGetMappedPointer", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsResourceSetMapFlags", + ( + "hipGraphicsResourceSetMapFlags", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsSubResourceGetMappedArray", + ( + "hipGraphicsSubResourceGetMappedArray", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsUnmapResources", + ("hipGraphicsUnmapResources", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsUnregisterResource", + ( + "hipGraphicsUnregisterResource", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuProfilerInitialize", + ("hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuProfilerStart", ("hipProfilerStart", CONV_OTHER, API_DRIVER)), + ("cuProfilerStop", ("hipProfilerStop", CONV_OTHER, API_DRIVER)), + ( + "CU_GL_DEVICE_LIST_ALL", + ("HIP_GL_DEVICE_LIST_ALL", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GL_DEVICE_LIST_CURRENT_FRAME", + ("HIP_GL_DEVICE_LIST_CURRENT_FRAME", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GL_DEVICE_LIST_NEXT_FRAME", + ("HIP_GL_DEVICE_LIST_NEXT_FRAME", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuGLGetDevices", ("hipGLGetDevices", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuGraphicsGLRegisterBuffer", + ("hipGraphicsGLRegisterBuffer", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsGLRegisterImage", + ("hipGraphicsGLRegisterImage", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuWGLGetDevice", ("hipWGLGetDevice", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CU_GL_MAP_RESOURCE_FLAGS_NONE", + ("HIP_GL_MAP_RESOURCE_FLAGS_NONE", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY", + CONV_GL, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + CONV_GL, + 
API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("cuGLCtxCreate", ("hipGLCtxCreate", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ("cuGLInit", ("hipGLInit", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuGLMapBufferObject", + ("hipGLMapBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLMapBufferObjectAsync", + ("hipGLMapBufferObjectAsync", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLRegisterBufferObject", + ("hipGLRegisterBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLSetBufferObjectMapFlags", + ("hipGLSetBufferObjectMapFlags", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLUnmapBufferObject", + ("hipGLUnmapBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLUnmapBufferObjectAsync", + ("hipGLUnmapBufferObjectAsync", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLUnregisterBufferObject", + ("hipGLUnregisterBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_DEVICE_LIST_ALL", + ("HIP_D3D9_DEVICE_LIST_ALL", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_DEVICE_LIST_CURRENT_FRAME", + ( + "HIP_D3D9_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D9_DEVICE_LIST_NEXT_FRAME", + ("HIP_D3D9_DEVICE_LIST_NEXT_FRAME", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9CtxCreate", + ("hipD3D9CtxCreate", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9CtxCreateOnDevice", + ("hipD3D9CtxCreateOnDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9GetDevice", + ("hipD3D9GetDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9GetDevices", + ("hipD3D9GetDevices", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9GetDirect3DDevice", + ("hipD3D9GetDirect3DDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsD3D9RegisterResource", + ("hipGraphicsD3D9RegisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + 
"CU_D3D9_MAPRESOURCE_FLAGS_NONE", + ("HIP_D3D9_MAPRESOURCE_FLAGS_NONE", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_MAPRESOURCE_FLAGS_READONLY", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D9_REGISTER_FLAGS_NONE", + ("HIP_D3D9_REGISTER_FLAGS_NONE", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_REGISTER_FLAGS_ARRAY", + ("HIP_D3D9_REGISTER_FLAGS_ARRAY", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9MapResources", + ("hipD3D9MapResources", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9RegisterResource", + ("hipD3D9RegisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedArray", + ("hipD3D9ResourceGetMappedArray", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedPitch", + ("hipD3D9ResourceGetMappedPitch", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedPointer", + ("hipD3D9ResourceGetMappedPointer", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedSize", + ("hipD3D9ResourceGetMappedSize", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetSurfaceDimensions", + ( + "hipD3D9ResourceGetSurfaceDimensions", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D9ResourceSetMapFlags", + ("hipD3D9ResourceSetMapFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9UnmapResources", + ("hipD3D9UnmapResources", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9UnregisterResource", + ("hipD3D9UnregisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D10_DEVICE_LIST_ALL", + ("HIP_D3D10_DEVICE_LIST_ALL", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D10_DEVICE_LIST_CURRENT_FRAME", + ( + 
"HIP_D3D10_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_DEVICE_LIST_NEXT_FRAME", + ( + "HIP_D3D10_DEVICE_LIST_NEXT_FRAME", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D10GetDevice", + ("hipD3D10GetDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10GetDevices", + ("hipD3D10GetDevices", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsD3D10RegisterResource", + ( + "hipGraphicsD3D10RegisterResource", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_MAPRESOURCE_FLAGS_NONE", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_NONE", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_MAPRESOURCE_FLAGS_READONLY", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_REGISTER_FLAGS_NONE", + ("HIP_D3D10_REGISTER_FLAGS_NONE", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D10_REGISTER_FLAGS_ARRAY", + ("HIP_D3D10_REGISTER_FLAGS_ARRAY", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10CtxCreate", + ("hipD3D10CtxCreate", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10CtxCreateOnDevice", + ("hipD3D10CtxCreateOnDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10GetDirect3DDevice", + ("hipD3D10GetDirect3DDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10MapResources", + ("hipD3D10MapResources", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10RegisterResource", + ("hipD3D10RegisterResource", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetMappedArray", + ("hipD3D10ResourceGetMappedArray", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetMappedPitch", + ("hipD3D10ResourceGetMappedPitch", 
CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetMappedPointer", + ( + "hipD3D10ResourceGetMappedPointer", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D10ResourceGetMappedSize", + ("hipD3D10ResourceGetMappedSize", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetSurfaceDimensions", + ( + "hipD3D10ResourceGetSurfaceDimensions", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D10ResourceSetMapFlags", + ("hipD3D10ResourceSetMapFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10UnmapResources", + ("hipD3D10UnmapResources", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10UnregisterResource", + ("hipD3D10UnregisterResource", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D11_DEVICE_LIST_ALL", + ("HIP_D3D11_DEVICE_LIST_ALL", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D11_DEVICE_LIST_CURRENT_FRAME", + ( + "HIP_D3D11_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D11, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D11_DEVICE_LIST_NEXT_FRAME", + ( + "HIP_D3D11_DEVICE_LIST_NEXT_FRAME", + CONV_D3D11, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D11GetDevice", + ("hipD3D11GetDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D11GetDevices", + ("hipD3D11GetDevices", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsD3D11RegisterResource", + ( + "hipGraphicsD3D11RegisterResource", + CONV_D3D11, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D11CtxCreate", + ("hipD3D11CtxCreate", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D11CtxCreateOnDevice", + ("hipD3D11CtxCreateOnDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D11GetDirect3DDevice", + ("hipD3D11GetDirect3DDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsVDPAURegisterOutputSurface", + ( + "hipGraphicsVDPAURegisterOutputSurface", + CONV_VDPAU, + API_DRIVER, + HIP_UNSUPPORTED, + ), + 
), + ( + "cuGraphicsVDPAURegisterVideoSurface", + ( + "hipGraphicsVDPAURegisterVideoSurface", + CONV_VDPAU, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuVDPAUGetDevice", + ("hipVDPAUGetDevice", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuVDPAUCtxCreate", + ("hipVDPAUCtxCreate", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerAcquireFrame", + ("hipEGLStreamConsumerAcquireFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerConnect", + ("hipEGLStreamConsumerConnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerConnectWithFlags", + ( + "hipEGLStreamConsumerConnectWithFlags", + CONV_EGL, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuEGLStreamConsumerDisconnect", + ("hipEGLStreamConsumerDisconnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerReleaseFrame", + ("hipEGLStreamConsumerReleaseFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamProducerConnect", + ("hipEGLStreamProducerConnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamProducerDisconnect", + ("hipEGLStreamProducerDisconnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamProducerPresentFrame", + ("hipEGLStreamProducerPresentFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamProducerReturnFrame", + ("hipEGLStreamProducerReturnFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsEGLRegisterImage", + ("hipGraphicsEGLRegisterImage", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsResourceGetMappedEglFrame", + ( + "hipGraphicsResourceGetMappedEglFrame", + CONV_EGL, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("cudaDataType_t", ("hipDataType", CONV_TYPE, API_RUNTIME)), + ("cudaDataType", ("hipDataType", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_32F", ("HIP_R_32F", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_64F", ("HIP_R_64F", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_16F", ("HIP_R_16F", 
CONV_TYPE, API_RUNTIME)), + ("CUDA_R_8I", ("HIP_R_8I", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_32F", ("HIP_C_32F", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_64F", ("HIP_C_64F", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_16F", ("HIP_C_16F", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_8I", ("HIP_C_8I", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_8U", ("HIP_R_8U", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_8U", ("HIP_C_8U", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_32I", ("HIP_R_32I", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_32I", ("HIP_C_32I", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_32U", ("HIP_R_32U", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_32U", ("HIP_C_32U", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_16BF", ("HIP_R_16BF", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_16BF", ("HIP_C_16BF", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_4I", ("HIP_R_4I", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_4I", ("HIP_C_4I", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_4U", ("HIP_R_4U", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_4U", ("HIP_C_4U", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_16I", ("HIP_R_16I", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_16I", ("HIP_C_16I", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_16U", ("HIP_R_16U", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_16U", ("HIP_C_16U", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_64I", ("HIP_R_64I", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_64I", ("HIP_C_64I", CONV_TYPE, API_RUNTIME)), + ("CUDA_R_64U", ("HIP_R_64U", CONV_TYPE, API_RUNTIME)), + ("CUDA_C_64U", ("HIP_C_64U", CONV_TYPE, API_RUNTIME)), + ( + "MAJOR_VERSION", + ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "MINOR_VERSION", + ("hipLibraryMinorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "PATCH_LEVEL", + ("hipLibraryPatchVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAttachGlobal", + ("hipMemAttachGlobal", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAttachHost", + ("hipMemAttachHost", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAttachSingle", + ("hipMemAttachSingle", CONV_TYPE, 
API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaOccupancyDefault", + ("hipOccupancyDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaOccupancyDisableCachingOverride", + ( + "hipOccupancyDisableCachingOverride", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaGetLastError", ("hipGetLastError", CONV_ERROR, API_RUNTIME)), + ("cudaPeekAtLastError", ("hipPeekAtLastError", CONV_ERROR, API_RUNTIME)), + ("cudaGetErrorName", ("hipGetErrorName", CONV_ERROR, API_RUNTIME)), + ("cudaGetErrorString", ("hipGetErrorString", CONV_ERROR, API_RUNTIME)), + ("cudaMemcpy3DParms", ("hipMemcpy3DParms", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpy3DPeerParms", + ("hipMemcpy3DPeerParms", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpy", ("hipMemcpy", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyToArray", ("hipMemcpyToArray", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyToSymbol", ("hipMemcpyToSymbol", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyToSymbolAsync", ("hipMemcpyToSymbolAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyAsync", ("hipMemcpyAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemcpy2D", ("hipMemcpy2D", CONV_MEM, API_RUNTIME)), + ("cudaMemcpy2DAsync", ("hipMemcpy2DAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemcpy2DToArray", ("hipMemcpy2DToArray", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpy2DArrayToArray", + ("hipMemcpy2DArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy2DFromArray", + ("hipMemcpy2DFromArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy2DFromArrayAsync", + ("hipMemcpy2DFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy2DToArrayAsync", + ("hipMemcpy2DToArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpy3D", ("hipMemcpy3D", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpy3DAsync", + ("hipMemcpy3DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy3DPeer", + ("hipMemcpy3DPeer", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + 
"cudaMemcpy3DPeerAsync", + ("hipMemcpy3DPeerAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpyArrayToArray", + ("hipMemcpyArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpyFromArrayAsync", + ("hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpyFromSymbol", ("hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpyFromSymbolAsync", + ("hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME), + ), + ("cudaMemAdvise", ("hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaMemRangeGetAttribute", + ("hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeGetAttributes", + ("hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseSetReadMostly", + ("hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseUnsetReadMostly", + ("hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseSetPreferredLocation", + ( + "hipMemAdviseSetPreferredLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaMemAdviseUnsetPreferredLocation", + ( + "hipMemAdviseUnsetPreferredLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaMemAdviseSetAccessedBy", + ("hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseUnsetAccessedBy", + ("hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeAttributeReadMostly", + ("hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeAttributePreferredLocation", + ( + "hipMemRangeAttributePreferredLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaMemRangeAttributeAccessedBy", + ("hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeAttributeLastPrefetchLocation", + ( 
+ "hipMemRangeAttributeLastPrefetchLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaMemcpyHostToHost", ("hipMemcpyHostToHost", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyHostToDevice", ("hipMemcpyHostToDevice", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyDeviceToHost", ("hipMemcpyDeviceToHost", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpyDeviceToDevice", + ("hipMemcpyDeviceToDevice", CONV_MEM, API_RUNTIME), + ), + ("cudaMemcpyDefault", ("hipMemcpyDefault", CONV_MEM, API_RUNTIME)), + ("cudaMemset", ("hipMemset", CONV_MEM, API_RUNTIME)), + ("cudaMemsetAsync", ("hipMemsetAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemset2D", ("hipMemset2D", CONV_MEM, API_RUNTIME)), + ( + "cudaMemset2DAsync", + ("hipMemset2DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemset3D", ("hipMemset3D", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaMemset3DAsync", + ("hipMemset3DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemGetInfo", ("hipMemGetInfo", CONV_MEM, API_RUNTIME)), + ( + "cudaArrayGetInfo", + ("hipArrayGetInfo", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFreeMipmappedArray", + ("hipFreeMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetMipmappedArrayLevel", + ("hipGetMipmappedArrayLevel", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSymbolAddress", + ("hipGetSymbolAddress", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSymbolSize", + ("hipGetSymbolSize", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemPrefetchAsync", + ("hipMemPrefetchAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMallocHost", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), + ("cudaMallocArray", ("hipMallocArray", CONV_MEM, API_RUNTIME)), + ("cudaMalloc", ("hipMalloc", CONV_MEM, API_RUNTIME)), + ("cudaMalloc3D", ("hipMalloc3D", CONV_MEM, API_RUNTIME)), + ("cudaMalloc3DArray", ("hipMalloc3DArray", CONV_MEM, API_RUNTIME)), + ( + "cudaMallocManaged", + ("hipMallocManaged", 
CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMallocMipmappedArray", + ("hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMallocPitch", ("hipMallocPitch", CONV_MEM, API_RUNTIME)), + ("cudaFreeHost", ("hipHostFree", CONV_MEM, API_RUNTIME)), + ("cudaFreeArray", ("hipFreeArray", CONV_MEM, API_RUNTIME)), + ("cudaFree", ("hipFree", CONV_MEM, API_RUNTIME)), + ("cudaHostRegister", ("hipHostRegister", CONV_MEM, API_RUNTIME)), + ("cudaHostUnregister", ("hipHostUnregister", CONV_MEM, API_RUNTIME)), + ("cudaHostAlloc", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), + ("cudaMemoryTypeHost", ("hipMemoryTypeHost", CONV_MEM, API_RUNTIME)), + ("cudaMemoryTypeDevice", ("hipMemoryTypeDevice", CONV_MEM, API_RUNTIME)), + ("make_cudaExtent", ("make_hipExtent", CONV_MEM, API_RUNTIME)), + ("make_cudaPitchedPtr", ("make_hipPitchedPtr", CONV_MEM, API_RUNTIME)), + ("make_cudaPos", ("make_hipPos", CONV_MEM, API_RUNTIME)), + ("cudaHostAllocDefault", ("hipHostMallocDefault", CONV_MEM, API_RUNTIME)), + ("cudaHostAllocPortable", ("hipHostMallocPortable", CONV_MEM, API_RUNTIME)), + ("cudaHostAllocMapped", ("hipHostMallocMapped", CONV_MEM, API_RUNTIME)), + ( + "cudaHostAllocWriteCombined", + ("hipHostMallocWriteCombined", CONV_MEM, API_RUNTIME), + ), + ("cudaHostGetFlags", ("hipHostGetFlags", CONV_MEM, API_RUNTIME)), + ("cudaHostRegisterDefault", ("hipHostRegisterDefault", CONV_MEM, API_RUNTIME)), + ( + "cudaHostRegisterPortable", + ("hipHostRegisterPortable", CONV_MEM, API_RUNTIME), + ), + ("cudaHostRegisterMapped", ("hipHostRegisterMapped", CONV_MEM, API_RUNTIME)), + ( + "cudaHostRegisterIoMemory", + ("hipHostRegisterIoMemory", CONV_MEM, API_RUNTIME), + ), + # ("warpSize", ("hipWarpSize", CONV_SPECIAL_FUNC, API_RUNTIME), (HIP actually uses warpSize...)), + ("cudaEventCreate", ("hipEventCreate", CONV_EVENT, API_RUNTIME)), + ( + "cudaEventCreateWithFlags", + ("hipEventCreateWithFlags", CONV_EVENT, API_RUNTIME), + ), + ("cudaEventDestroy", ("hipEventDestroy", 
CONV_EVENT, API_RUNTIME)), + ("cudaEventRecord", ("hipEventRecord", CONV_EVENT, API_RUNTIME)), + ("cudaEventElapsedTime", ("hipEventElapsedTime", CONV_EVENT, API_RUNTIME)), + ("cudaEventSynchronize", ("hipEventSynchronize", CONV_EVENT, API_RUNTIME)), + ("cudaEventQuery", ("hipEventQuery", CONV_EVENT, API_RUNTIME)), + ("cudaEventDefault", ("hipEventDefault", CONV_EVENT, API_RUNTIME)), + ("cudaEventBlockingSync", ("hipEventBlockingSync", CONV_EVENT, API_RUNTIME)), + ("cudaEventDisableTiming", ("hipEventDisableTiming", CONV_EVENT, API_RUNTIME)), + ("cudaEventInterprocess", ("hipEventInterprocess", CONV_EVENT, API_RUNTIME)), + ("cudaStreamCreate", ("hipStreamCreate", CONV_STREAM, API_RUNTIME)), + ( + "cudaStreamCreateWithFlags", + ("hipStreamCreateWithFlags", CONV_STREAM, API_RUNTIME), + ), + ( + "cudaStreamCreateWithPriority", + ("hipStreamCreateWithPriority", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaStreamDestroy", ("hipStreamDestroy", CONV_STREAM, API_RUNTIME)), + ("cudaStreamWaitEvent", ("hipStreamWaitEvent", CONV_STREAM, API_RUNTIME)), + ("cudaStreamSynchronize", ("hipStreamSynchronize", CONV_STREAM, API_RUNTIME)), + ("cudaStreamGetFlags", ("hipStreamGetFlags", CONV_STREAM, API_RUNTIME)), + ("cudaStreamQuery", ("hipStreamQuery", CONV_STREAM, API_RUNTIME)), + ("cudaStreamAddCallback", ("hipStreamAddCallback", CONV_STREAM, API_RUNTIME)), + ( + "cudaStreamAttachMemAsync", + ("hipStreamAttachMemAsync", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaStreamGetPriority", + ("hipStreamGetPriority", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaCpuDeviceId", ("hipCpuDeviceId", CONV_TYPE, API_RUNTIME)), + ("cudaStreamDefault", ("hipStreamDefault", CONV_TYPE, API_RUNTIME)), + ("cudaStreamNonBlocking", ("hipStreamNonBlocking", CONV_TYPE, API_RUNTIME)), + ( + "cudaStreamGetCaptureInfo", + ("hipStreamGetCaptureInfo", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaStreamGetCaptureInfo_v2", + ("hipStreamGetCaptureInfo_v2", CONV_TYPE, 
API_RUNTIME), + ), + ("cudaStreamCaptureStatus", ("hipStreamCaptureStatus", CONV_TYPE, API_RUNTIME)), + ( + "cudaStreamCaptureStatusActive", + ("hipStreamCaptureStatusActive", CONV_TYPE, API_RUNTIME), + ), + ("cudaStreamCaptureMode", ("hipStreamCaptureMode", CONV_TYPE, API_RUNTIME)), + ( + "cudaStreamCaptureModeGlobal", + ("hipStreamCaptureModeGlobal", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaStreamCaptureModeRelaxed", + ("hipStreamCaptureModeRelaxed", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaStreamCaptureModeThreadLocal", + ("hipStreamCaptureModeThreadLocal", CONV_TYPE, API_RUNTIME), + ), + ("cudaStreamBeginCapture", ("hipStreamBeginCapture", CONV_TYPE, API_RUNTIME)), + ("cudaStreamEndCapture", ("hipStreamEndCapture", CONV_TYPE, API_RUNTIME)), + ("cudaGraphInstantiate", ("hipGraphInstantiate", CONV_TYPE, API_RUNTIME)), + ( + "cudaGraphInstantiateWithFlags", + ("hipGraphInstantiateWithFlags", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaGraphInstantiateFlagAutoFreeOnLaunch", + ("hipGraphInstantiateFlagAutoFreeOnLaunch", CONV_TYPE, API_RUNTIME), + ), + ("cudaGraphDestroy", ("hipGraphDestroy", CONV_TYPE, API_RUNTIME)), + ("cudaGraphExecDestroy", ("hipGraphExecDestroy", CONV_TYPE, API_RUNTIME)), + ("cudaGraphLaunch", ("hipGraphLaunch", CONV_TYPE, API_RUNTIME)), + ("cudaGraphGetNodes", ("hipGraphGetNodes", CONV_TYPE, API_RUNTIME)), + ("cudaGraphDebugDotPrint", ("hipGraphDebugDotPrint", CONV_TYPE, API_RUNTIME)), + ( + "cudaGraphDebugDotFlagsVerbose", + ("hipGraphDebugDotFlagsVerbose", CONV_NUMERIC_LITERAL, API_RUNTIME), + ), + ( + "cudaGraphRetainUserObject", + ("hipGraphRetainUserObject", CONV_TYPE, API_RUNTIME), + ), + ("cudaGraphUserObjectMove", ("hipGraphUserObjectMove", CONV_TYPE, API_RUNTIME)), + ("cudaUserObject_t", ("hipUserObject_t", CONV_TYPE, API_RUNTIME)), + ("cudaUserObjectCreate", ("hipUserObjectCreate", CONV_TYPE, API_RUNTIME)), + ( + "cudaUserObjectNoDestructorSync", + ("hipUserObjectNoDestructorSync", CONV_TYPE, API_RUNTIME), + ), + ( + 
"cudaThreadExchangeStreamCaptureMode", + ("hipThreadExchangeStreamCaptureMode", CONV_TYPE, API_RUNTIME), + ), + ("cudaStreamIsCapturing", ("hipStreamIsCapturing", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceSynchronize", ("hipDeviceSynchronize", CONV_DEVICE, API_RUNTIME)), + ("cudaDeviceReset", ("hipDeviceReset", CONV_DEVICE, API_RUNTIME)), + ("cudaSetDevice", ("hipSetDevice", CONV_DEVICE, API_RUNTIME)), + ("cudaGetDevice", ("hipGetDevice", CONV_DEVICE, API_RUNTIME)), + ("cudaGetDeviceCount", ("hipGetDeviceCount", CONV_DEVICE, API_RUNTIME)), + ("cudaChooseDevice", ("hipChooseDevice", CONV_DEVICE, API_RUNTIME)), + ("cudaThreadExit", ("hipDeviceReset", CONV_THREAD, API_RUNTIME)), + ( + "cudaThreadGetCacheConfig", + ("hipDeviceGetCacheConfig", CONV_THREAD, API_RUNTIME), + ), + ( + "cudaThreadGetLimit", + ("hipThreadGetLimit", CONV_THREAD, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaThreadSetCacheConfig", + ("hipDeviceSetCacheConfig", CONV_THREAD, API_RUNTIME), + ), + ( + "cudaThreadSetLimit", + ("hipThreadSetLimit", CONV_THREAD, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaThreadSynchronize", ("hipDeviceSynchronize", CONV_THREAD, API_RUNTIME)), + ("cudaDeviceGetAttribute", ("hipDeviceGetAttribute", CONV_DEVICE, API_RUNTIME)), + ( + "cudaDevAttrMaxThreadsPerBlock", + ("hipDeviceAttributeMaxThreadsPerBlock", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxBlockDimX", + ("hipDeviceAttributeMaxBlockDimX", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxBlockDimY", + ("hipDeviceAttributeMaxBlockDimY", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxBlockDimZ", + ("hipDeviceAttributeMaxBlockDimZ", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxGridDimX", + ("hipDeviceAttributeMaxGridDimX", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxGridDimY", + ("hipDeviceAttributeMaxGridDimY", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxGridDimZ", + ("hipDeviceAttributeMaxGridDimZ", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxSharedMemoryPerBlock", + 
("hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxSharedMemoryPerBlockOptin", + ("hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrTotalConstantMemory", + ("hipDeviceAttributeTotalConstantMemory", CONV_TYPE, API_RUNTIME), + ), + ("cudaDevAttrWarpSize", ("hipDeviceAttributeWarpSize", CONV_TYPE, API_RUNTIME)), + ( + "cudaDevAttrMaxPitch", + ("hipDeviceAttributeMaxPitch", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrMaxRegistersPerBlock", + ("hipDeviceAttributeMaxRegistersPerBlock", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrClockRate", + ("hipDeviceAttributeClockRate", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrTextureAlignment", + ( + "hipDeviceAttributeTextureAlignment", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrGpuOverlap", + ("hipDeviceAttributeGpuOverlap", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrMultiProcessorCount", + ("hipDeviceAttributeMultiprocessorCount", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrKernelExecTimeout", + ( + "hipDeviceAttributeKernelExecTimeout", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrIntegrated", + ("hipDeviceAttributeIntegrated", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrCanMapHostMemory", + ( + "hipDeviceAttributeCanMapHostMemory", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrComputeMode", + ("hipDeviceAttributeComputeMode", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxTexture1DWidth", + ( + "hipDeviceAttributeMaxTexture1DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DWidth", + ( + "hipDeviceAttributeMaxTexture2DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DHeight", + ( + "hipDeviceAttributeMaxTexture2DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( 
+ "cudaDevAttrMaxTexture3DWidth", + ( + "hipDeviceAttributeMaxTexture3DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DHeight", + ( + "hipDeviceAttributeMaxTexture3DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DDepth", + ( + "hipDeviceAttributeMaxTexture3DDepth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLayeredWidth", + ( + "hipDeviceAttributeMaxTexture2DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLayeredHeight", + ( + "hipDeviceAttributeMaxTexture2DLayeredHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLayeredLayers", + ( + "hipDeviceAttributeMaxTexture2DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrSurfaceAlignment", + ( + "hipDeviceAttributeSurfaceAlignment", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrConcurrentKernels", + ("hipDeviceAttributeConcurrentKernels", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrEccEnabled", + ("hipDeviceAttributeEccEnabled", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDevAttrPciBusId", ("hipDeviceAttributePciBusId", CONV_TYPE, API_RUNTIME)), + ( + "cudaDevAttrPciDeviceId", + ("hipDeviceAttributePciDeviceId", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrTccDriver", + ("hipDeviceAttributeTccDriver", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrMemoryClockRate", + ("hipDeviceAttributeMemoryClockRate", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrGlobalMemoryBusWidth", + ("hipDeviceAttributeMemoryBusWidth", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrL2CacheSize", + ("hipDeviceAttributeL2CacheSize", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxThreadsPerMultiProcessor", + ("hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_TYPE, API_RUNTIME), + ), + ( + 
"cudaDevAttrAsyncEngineCount", + ( + "hipDeviceAttributeAsyncEngineCount", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrUnifiedAddressing", + ( + "hipDeviceAttributeUnifiedAddressing", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture1DLayeredWidth", + ( + "hipDeviceAttributeMaxTexture1DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture1DLayeredLayers", + ( + "hipDeviceAttributeMaxTexture1DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DGatherWidth", + ( + "hipDeviceAttributeMaxTexture2DGatherWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DGatherHeight", + ( + "hipDeviceAttributeMaxTexture2DGatherHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DWidthAlt", + ( + "hipDeviceAttributeMaxTexture3DWidthAlternate", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DHeightAlt", + ( + "hipDeviceAttributeMaxTexture3DHeightAlternate", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DDepthAlt", + ( + "hipDeviceAttributeMaxTexture3DDepthAlternate", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrPciDomainId", + ("hipDeviceAttributePciDomainId", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrTexturePitchAlignment", + ( + "hipDeviceAttributeTexturePitchAlignment", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTextureCubemapWidth", + ( + "hipDeviceAttributeMaxTextureCubemapWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTextureCubemapLayeredWidth", + ( + "hipDeviceAttributeMaxTextureCubemapLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTextureCubemapLayeredLayers", + ( + 
"hipDeviceAttributeMaxTextureCubemapLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface1DWidth", + ( + "hipDeviceAttributeMaxSurface1DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DWidth", + ( + "hipDeviceAttributeMaxSurface2DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DHeight", + ( + "hipDeviceAttributeMaxSurface2DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface3DWidth", + ( + "hipDeviceAttributeMaxSurface3DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface3DHeight", + ( + "hipDeviceAttributeMaxSurface3DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface3DDepth", + ( + "hipDeviceAttributeMaxSurface3DDepth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface1DLayeredWidth", + ( + "hipDeviceAttributeMaxSurface1DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface1DLayeredLayers", + ( + "hipDeviceAttributeMaxSurface1DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DLayeredWidth", + ( + "hipDeviceAttributeMaxSurface2DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DLayeredHeight", + ( + "hipDeviceAttributeMaxSurface2DLayeredHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DLayeredLayers", + ( + "hipDeviceAttributeMaxSurface2DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurfaceCubemapWidth", + ( + "hipDeviceAttributeMaxSurfaceCubemapWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurfaceCubemapLayeredWidth", + ( + "hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", + CONV_TYPE, 
+ API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurfaceCubemapLayeredLayers", + ( + "hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture1DLinearWidth", + ( + "hipDeviceAttributeMaxTexture1DLinearWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLinearWidth", + ( + "hipDeviceAttributeMaxTexture2DLinearWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLinearHeight", + ( + "hipDeviceAttributeMaxTexture2DLinearHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLinearPitch", + ( + "hipDeviceAttributeMaxTexture2DLinearPitch", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DMipmappedWidth", + ( + "hipDeviceAttributeMaxTexture2DMipmappedWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DMipmappedHeight", + ( + "hipDeviceAttributeMaxTexture2DMipmappedHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrComputeCapabilityMajor", + ("hipDeviceAttributeComputeCapabilityMajor", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrComputeCapabilityMinor", + ("hipDeviceAttributeComputeCapabilityMinor", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxTexture1DMipmappedWidth", + ( + "hipDeviceAttributeMaxTexture1DMipmappedWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrStreamPrioritiesSupported", + ( + "hipDeviceAttributeStreamPrioritiesSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrGlobalL1CacheSupported", + ( + "hipDeviceAttributeGlobalL1CacheSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrLocalL1CacheSupported", + ( + "hipDeviceAttributeLocalL1CacheSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + 
"cudaDevAttrMaxSharedMemoryPerMultiprocessor", + ( + "hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", + CONV_TYPE, + API_RUNTIME, + ), + ), + ( + "cudaDevAttrMaxRegistersPerMultiprocessor", + ( + "hipDeviceAttributeMaxRegistersPerMultiprocessor", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrManagedMemory", + ( + "hipDeviceAttributeManagedMemory", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrIsMultiGpuBoard", + ("hipDeviceAttributeIsMultiGpuBoard", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMultiGpuBoardGroupID", + ( + "hipDeviceAttributeMultiGpuBoardGroupID", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrHostNativeAtomicSupported", + ( + "hipDeviceAttributeHostNativeAtomicSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrSingleToDoublePrecisionPerfRatio", + ( + "hipDeviceAttributeSingleToDoublePrecisionPerfRatio", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrPageableMemoryAccess", + ( + "hipDeviceAttributePageableMemoryAccess", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrConcurrentManagedAccess", + ( + "hipDeviceAttributeConcurrentManagedAccess", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrComputePreemptionSupported", + ( + "hipDeviceAttributeComputePreemptionSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrCanUseHostPointerForRegisteredMem", + ( + "hipDeviceAttributeCanUseHostPointerForRegisteredMem", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaPointerGetAttributes", + ("hipPointerGetAttributes", CONV_MEM, API_RUNTIME), + ), + ( + "cudaHostGetDevicePointer", + ("hipHostGetDevicePointer", CONV_MEM, API_RUNTIME), + ), + ( + "cudaGetDeviceProperties", + ("hipGetDeviceProperties", CONV_DEVICE, API_RUNTIME), + ), + ("cudaDeviceGetPCIBusId", ("hipDeviceGetPCIBusId", CONV_DEVICE, 
API_RUNTIME)), + ( + "cudaDeviceGetByPCIBusId", + ("hipDeviceGetByPCIBusId", CONV_DEVICE, API_RUNTIME), + ), + ( + "cudaDeviceGetStreamPriorityRange", + ( + "hipDeviceGetStreamPriorityRange", + CONV_DEVICE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaSetValidDevices", + ("hipSetValidDevices", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevP2PAttrPerformanceRank", + ( + "hipDeviceP2PAttributePerformanceRank", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevP2PAttrAccessSupported", + ( + "hipDeviceP2PAttributeAccessSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevP2PAttrNativeAtomicSupported", + ( + "hipDeviceP2PAttributeNativeAtomicSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDeviceGetP2PAttribute", + ("hipDeviceGetP2PAttribute", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeDefault", + ("hipComputeModeDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeExclusive", + ("hipComputeModeExclusive", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeProhibited", + ("hipComputeModeProhibited", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeExclusiveProcess", + ("hipComputeModeExclusiveProcess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetDeviceFlags", + ("hipGetDeviceFlags", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaSetDeviceFlags", ("hipSetDeviceFlags", CONV_DEVICE, API_RUNTIME)), + ("cudaDeviceScheduleAuto", ("hipDeviceScheduleAuto", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceScheduleSpin", ("hipDeviceScheduleSpin", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceScheduleYield", ("hipDeviceScheduleYield", CONV_TYPE, API_RUNTIME)), + ( + "cudaDeviceBlockingSync", + ("hipDeviceScheduleBlockingSync", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDeviceScheduleBlockingSync", + ("hipDeviceScheduleBlockingSync", CONV_TYPE, API_RUNTIME), + ), + ( + 
"cudaDeviceScheduleMask", + ("hipDeviceScheduleMask", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDeviceMapHost", ("hipDeviceMapHost", CONV_TYPE, API_RUNTIME)), + ( + "cudaDeviceLmemResizeToMax", + ("hipDeviceLmemResizeToMax", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDeviceMask", ("hipDeviceMask", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaDeviceSetCacheConfig", + ("hipDeviceSetCacheConfig", CONV_CACHE, API_RUNTIME), + ), + ( + "cudaDeviceGetCacheConfig", + ("hipDeviceGetCacheConfig", CONV_CACHE, API_RUNTIME), + ), + ( + "cudaFuncAttributes", + ("hipFuncAttributes", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaFuncAttributeMaxDynamicSharedMemorySize", + ("hipFuncAttributeMaxDynamicSharedMemorySize", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaFuncAttributePreferredSharedMemoryCarveout", + ("hipFuncAttributePreferredSharedMemoryCarveout", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaFuncSetAttribute", + ("hipFuncSetAttribute", CONV_EXEC, API_RUNTIME), + ), + ("cudaFuncSetCacheConfig", ("hipFuncSetCacheConfig", CONV_CACHE, API_RUNTIME)), + ( + "cudaFuncCachePreferNone", + ("hipFuncCachePreferNone", CONV_CACHE, API_RUNTIME), + ), + ( + "cudaFuncCachePreferShared", + ("hipFuncCachePreferShared", CONV_CACHE, API_RUNTIME), + ), + ("cudaFuncCachePreferL1", ("hipFuncCachePreferL1", CONV_CACHE, API_RUNTIME)), + ( + "cudaFuncCachePreferEqual", + ("hipFuncCachePreferEqual", CONV_CACHE, API_RUNTIME), + ), + ( + "cudaFuncGetAttributes", + ("hipFuncGetAttributes", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFuncSetSharedMemConfig", + ("hipFuncSetSharedMemConfig", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetParameterBuffer", + ("hipGetParameterBuffer", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaSetDoubleForDevice", + ("hipSetDoubleForDevice", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaSetDoubleForHost", + ("hipSetDoubleForHost", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + 
"cudaConfigureCall", + ("hipConfigureCall", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaLaunch", ("hipLaunch", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaLaunchCooperativeKernel", + ("hipLaunchCooperativeKernel", CONV_EXEC, API_RUNTIME), + ), + ( + "cudaSetupArgument", + ("hipSetupArgument", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDriverGetVersion", ("hipDriverGetVersion", CONV_VERSION, API_RUNTIME)), + ( + "cudaRuntimeGetVersion", + ("hipRuntimeGetVersion", CONV_VERSION, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaOccupancyMaxPotentialBlockSize", + ("hipOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_RUNTIME), + ), + ( + "cudaOccupancyMaxPotentialBlockSizeWithFlags", + ( + "hipOccupancyMaxPotentialBlockSizeWithFlags", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaOccupancyMaxActiveBlocksPerMultiprocessor", + ( + "hipOccupancyMaxActiveBlocksPerMultiprocessor", + CONV_OCCUPANCY, + API_RUNTIME, + ), + ), + ( + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + ( + "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaOccupancyMaxPotentialBlockSizeVariableSMem", + ( + "hipOccupancyMaxPotentialBlockSizeVariableSMem", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags", + ( + "hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaDeviceCanAccessPeer", ("hipDeviceCanAccessPeer", CONV_PEER, API_RUNTIME)), + ( + "cudaDeviceDisablePeerAccess", + ("hipDeviceDisablePeerAccess", CONV_PEER, API_RUNTIME), + ), + ( + "cudaDeviceEnablePeerAccess", + ("hipDeviceEnablePeerAccess", CONV_PEER, API_RUNTIME), + ), + ("cudaMemcpyPeerAsync", ("hipMemcpyPeerAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyPeer", ("hipMemcpyPeer", CONV_MEM, API_RUNTIME)), + ( + "cudaIpcMemLazyEnablePeerAccess", 
+ ("hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDeviceSetSharedMemConfig", + ("hipDeviceSetSharedMemConfig", CONV_DEVICE, API_RUNTIME), + ), + ( + "cudaDeviceGetSharedMemConfig", + ("hipDeviceGetSharedMemConfig", CONV_DEVICE, API_RUNTIME), + ), + ( + "cudaSharedMemBankSizeDefault", + ("hipSharedMemBankSizeDefault", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaSharedMemBankSizeFourByte", + ("hipSharedMemBankSizeFourByte", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaSharedMemBankSizeEightByte", + ("hipSharedMemBankSizeEightByte", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaLimitStackSize", + ("hipLimitStackSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaLimitPrintfFifoSize", + ("hipLimitPrintfFifoSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaLimitMallocHeapSize", ("hipLimitMallocHeapSize", CONV_TYPE, API_RUNTIME)), + ( + "cudaLimitDevRuntimeSyncDepth", + ("hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaLimitDevRuntimePendingLaunchCount", + ( + "hipLimitDevRuntimePendingLaunchCount", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaDeviceGetLimit", ("hipDeviceGetLimit", CONV_DEVICE, API_RUNTIME)), + ( + "cudaProfilerInitialize", + ("hipProfilerInitialize", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaProfilerStart", ("hipProfilerStart", CONV_OTHER, API_RUNTIME)), + ("cudaProfilerStop", ("hipProfilerStop", CONV_OTHER, API_RUNTIME)), + ( + "cudaKeyValuePair", + ("hipKeyValuePair", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaCSV", ("hipCSV", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), + ("cudaReadModeElementType", ("hipReadModeElementType", CONV_TEX, API_RUNTIME)), + ( + "cudaReadModeNormalizedFloat", + ("hipReadModeNormalizedFloat", CONV_TEX, API_RUNTIME), + ), + ("cudaFilterModePoint", ("hipFilterModePoint", CONV_TEX, API_RUNTIME)), + ("cudaFilterModeLinear", ("hipFilterModeLinear", CONV_TEX, API_RUNTIME)), + ("cudaBindTexture", 
("hipBindTexture", CONV_TEX, API_RUNTIME)), + ("cudaUnbindTexture", ("hipUnbindTexture", CONV_TEX, API_RUNTIME)), + ("cudaBindTexture2D", ("hipBindTexture2D", CONV_TEX, API_RUNTIME)), + ("cudaBindTextureToArray", ("hipBindTextureToArray", CONV_TEX, API_RUNTIME)), + ( + "cudaBindTextureToMipmappedArray", + ("hipBindTextureToMipmappedArray", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureAlignmentOffset", + ("hipGetTextureAlignmentOffset", CONV_TEX, API_RUNTIME), + ), + ("cudaGetTextureReference", ("hipGetTextureReference", CONV_TEX, API_RUNTIME)), + ( + "cudaChannelFormatKindSigned", + ("hipChannelFormatKindSigned", CONV_TEX, API_RUNTIME), + ), + ( + "cudaChannelFormatKindUnsigned", + ("hipChannelFormatKindUnsigned", CONV_TEX, API_RUNTIME), + ), + ( + "cudaChannelFormatKindFloat", + ("hipChannelFormatKindFloat", CONV_TEX, API_RUNTIME), + ), + ( + "cudaChannelFormatKindNone", + ("hipChannelFormatKindNone", CONV_TEX, API_RUNTIME), + ), + ("cudaCreateChannelDesc", ("hipCreateChannelDesc", CONV_TEX, API_RUNTIME)), + ("cudaGetChannelDesc", ("hipGetChannelDesc", CONV_TEX, API_RUNTIME)), + ("cudaResourceTypeArray", ("hipResourceTypeArray", CONV_TEX, API_RUNTIME)), + ( + "cudaResourceTypeMipmappedArray", + ("hipResourceTypeMipmappedArray", CONV_TEX, API_RUNTIME), + ), + ("cudaResourceTypeLinear", ("hipResourceTypeLinear", CONV_TEX, API_RUNTIME)), + ("cudaResourceTypePitch2D", ("hipResourceTypePitch2D", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatNone", ("hipResViewFormatNone", CONV_TEX, API_RUNTIME)), + ( + "cudaResViewFormatUnsignedChar1", + ("hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedChar2", + ("hipResViewFormatUnsignedChar2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedChar4", + ("hipResViewFormatUnsignedChar4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedChar1", + ("hipResViewFormatSignedChar1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedChar2", + 
("hipResViewFormatSignedChar2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedChar4", + ("hipResViewFormatSignedChar4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedShort1", + ("hipResViewFormatUnsignedShort1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedShort2", + ("hipResViewFormatUnsignedShort2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedShort4", + ("hipResViewFormatUnsignedShort4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedShort1", + ("hipResViewFormatSignedShort1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedShort2", + ("hipResViewFormatSignedShort2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedShort4", + ("hipResViewFormatSignedShort4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedInt1", + ("hipResViewFormatUnsignedInt1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedInt2", + ("hipResViewFormatUnsignedInt2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedInt4", + ("hipResViewFormatUnsignedInt4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedInt1", + ("hipResViewFormatSignedInt1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedInt2", + ("hipResViewFormatSignedInt2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedInt4", + ("hipResViewFormatSignedInt4", CONV_TEX, API_RUNTIME), + ), + ("cudaResViewFormatHalf1", ("hipResViewFormatHalf1", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatHalf2", ("hipResViewFormatHalf2", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatHalf4", ("hipResViewFormatHalf4", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatFloat1", ("hipResViewFormatFloat1", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatFloat2", ("hipResViewFormatFloat2", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatFloat4", ("hipResViewFormatFloat4", CONV_TEX, API_RUNTIME)), + ( + "cudaResViewFormatUnsignedBlockCompressed1", + ("hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, 
API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed2", + ("hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed3", + ("hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed4", + ("hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedBlockCompressed4", + ("hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed5", + ("hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedBlockCompressed5", + ("hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed6H", + ("hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedBlockCompressed6H", + ("hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed7", + ("hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_RUNTIME), + ), + ("cudaAddressModeWrap", ("hipAddressModeWrap", CONV_TEX, API_RUNTIME)), + ("cudaAddressModeClamp", ("hipAddressModeClamp", CONV_TEX, API_RUNTIME)), + ("cudaAddressModeMirror", ("hipAddressModeMirror", CONV_TEX, API_RUNTIME)), + ("cudaAddressModeBorder", ("hipAddressModeBorder", CONV_TEX, API_RUNTIME)), + ("cudaCreateTextureObject", ("hipCreateTextureObject", CONV_TEX, API_RUNTIME)), + ( + "cudaDestroyTextureObject", + ("hipDestroyTextureObject", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureObjectResourceDesc", + ("hipGetTextureObjectResourceDesc", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureObjectResourceViewDesc", + ("hipGetTextureObjectResourceViewDesc", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureObjectTextureDesc", + ("hipGetTextureObjectTextureDesc", CONV_TEX, API_RUNTIME), + ), + ( + "cudaBindSurfaceToArray", 
+ ("hipBindSurfaceToArray", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSurfaceReference", + ("hipGetSurfaceReference", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaBoundaryModeZero", + ("hipBoundaryModeZero", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaBoundaryModeClamp", + ("hipBoundaryModeClamp", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaBoundaryModeTrap", + ("hipBoundaryModeTrap", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFormatModeForced", + ("hipFormatModeForced", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFormatModeAuto", + ("hipFormatModeAuto", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaCreateSurfaceObject", + ("hipCreateSurfaceObject", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDestroySurfaceObject", + ("hipDestroySurfaceObject", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSurfaceObjectResourceDesc", + ( + "hipGetSurfaceObjectResourceDesc", + CONV_SURFACE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaIpcCloseMemHandle", ("hipIpcCloseMemHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcGetEventHandle", ("hipIpcGetEventHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcGetMemHandle", ("hipIpcGetMemHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcOpenEventHandle", ("hipIpcOpenEventHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcOpenMemHandle", ("hipIpcOpenMemHandle", CONV_DEVICE, API_RUNTIME)), + ( + "cudaGLGetDevices", + ("hipGLGetDevices", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterBuffer", + ("hipGraphicsGLRegisterBuffer", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterImage", + ("hipGraphicsGLRegisterImage", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaWGLGetDevice", + ("hipWGLGetDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsMapResources", + ("hipGraphicsMapResources", CONV_GRAPHICS, API_RUNTIME, 
HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsResourceGetMappedMipmappedArray", + ( + "hipGraphicsResourceGetMappedMipmappedArray", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsResourceGetMappedPointer", + ( + "hipGraphicsResourceGetMappedPointer", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsResourceSetMapFlags", + ( + "hipGraphicsResourceSetMapFlags", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsSubResourceGetMappedArray", + ( + "hipGraphicsSubResourceGetMappedArray", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsUnmapResources", + ("hipGraphicsUnmapResources", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsUnregisterResource", + ( + "hipGraphicsUnregisterResource", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFacePositiveX", + ( + "hipGraphicsCubeFacePositiveX", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFaceNegativeX", + ( + "hipGraphicsCubeFaceNegativeX", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFacePositiveY", + ( + "hipGraphicsCubeFacePositiveY", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFaceNegativeY", + ( + "hipGraphicsCubeFaceNegativeY", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFacePositiveZ", + ( + "hipGraphicsCubeFacePositiveZ", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFaceNegativeZ", + ( + "hipGraphicsCubeFaceNegativeZ", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsMapFlagsNone", + ("hipGraphicsMapFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsMapFlagsReadOnly", + ( + "hipGraphicsMapFlagsReadOnly", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + 
"cudaGraphicsMapFlagsWriteDiscard", + ( + "hipGraphicsMapFlagsWriteDiscard", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsNone", + ( + "hipGraphicsRegisterFlagsNone", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsReadOnly", + ( + "hipGraphicsRegisterFlagsReadOnly", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsWriteDiscard", + ( + "hipGraphicsRegisterFlagsWriteDiscard", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsSurfaceLoadStore", + ( + "hipGraphicsRegisterFlagsSurfaceLoadStore", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsTextureGather", + ( + "hipGraphicsRegisterFlagsTextureGather", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGLDeviceListAll", + ("HIP_GL_DEVICE_LIST_ALL", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLDeviceListCurrentFrame", + ("HIP_GL_DEVICE_LIST_CURRENT_FRAME", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLDeviceListNextFrame", + ("HIP_GL_DEVICE_LIST_NEXT_FRAME", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLGetDevices", + ("hipGLGetDevices", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterBuffer", + ("hipGraphicsGLRegisterBuffer", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterImage", + ("hipGraphicsGLRegisterImage", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaWGLGetDevice", + ("hipWGLGetDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLMapFlagsNone", + ("HIP_GL_MAP_RESOURCE_FLAGS_NONE", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLMapFlagsReadOnly", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY", + CONV_GL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGLMapFlagsWriteDiscard", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + CONV_GL, + 
API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGLMapBufferObject", + ("hipGLMapBufferObject__", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLMapBufferObjectAsync", + ("hipGLMapBufferObjectAsync__", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLRegisterBufferObject", + ("hipGLRegisterBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLSetBufferObjectMapFlags", + ("hipGLSetBufferObjectMapFlags", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLSetGLDevice", + ("hipGLSetGLDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLUnmapBufferObject", + ("hipGLUnmapBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLUnmapBufferObjectAsync", + ("hipGLUnmapBufferObjectAsync", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLUnregisterBufferObject", + ("hipGLUnregisterBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9DeviceListAll", + ("HIP_D3D9_DEVICE_LIST_ALL", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9DeviceListCurrentFrame", + ( + "HIP_D3D9_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9DeviceListNextFrame", + ( + "HIP_D3D9_DEVICE_LIST_NEXT_FRAME", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9GetDevice", + ("hipD3D9GetDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9GetDevices", + ("hipD3D9GetDevices", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9GetDirect3DDevice", + ("hipD3D9GetDirect3DDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9SetDirect3DDevice", + ("hipD3D9SetDirect3DDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D9RegisterResource", + ( + "hipGraphicsD3D9RegisterResource", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9MapFlags", + ("hipD3D9MapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + 
"cudaD3D9MapFlagsNone", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_NONE", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9MapFlagsReadOnly", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9MapFlagsWriteDiscard", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9RegisterFlagsNone", + ("HIP_D3D9_REGISTER_FLAGS_NONE", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9RegisterFlagsArray", + ("HIP_D3D9_REGISTER_FLAGS_ARRAY", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9MapResources", + ("hipD3D9MapResources", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9RegisterResource", + ("hipD3D9RegisterResource", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetMappedArray", + ("hipD3D9ResourceGetMappedArray", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetMappedPitch", + ("hipD3D9ResourceGetMappedPitch", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetMappedPointer", + ( + "hipD3D9ResourceGetMappedPointer", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9ResourceGetMappedSize", + ("hipD3D9ResourceGetMappedSize", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetSurfaceDimensions", + ( + "hipD3D9ResourceGetSurfaceDimensions", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9ResourceSetMapFlags", + ("hipD3D9ResourceSetMapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9UnmapResources", + ("hipD3D9UnmapResources", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9UnregisterResource", + ("hipD3D9UnregisterResource", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10DeviceListAll", + ("HIP_D3D10_DEVICE_LIST_ALL", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10DeviceListCurrentFrame", 
+ ( + "HIP_D3D10_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10DeviceListNextFrame", + ( + "HIP_D3D10_DEVICE_LIST_NEXT_FRAME", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10GetDevice", + ("hipD3D10GetDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10GetDevices", + ("hipD3D10GetDevices", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D10RegisterResource", + ( + "hipGraphicsD3D10RegisterResource", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10MapFlagsNone", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_NONE", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10MapFlagsReadOnly", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10MapFlagsWriteDiscard", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10RegisterFlagsNone", + ("HIP_D3D10_REGISTER_FLAGS_NONE", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10RegisterFlagsArray", + ( + "HIP_D3D10_REGISTER_FLAGS_ARRAY", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10GetDirect3DDevice", + ("hipD3D10GetDirect3DDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10MapResources", + ("hipD3D10MapResources", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10RegisterResource", + ("hipD3D10RegisterResource", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10ResourceGetMappedArray", + ( + "hipD3D10ResourceGetMappedArray", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceGetMappedPitch", + ( + "hipD3D10ResourceGetMappedPitch", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceGetMappedPointer", + ( + "hipD3D10ResourceGetMappedPointer", + CONV_D3D10, + API_RUNTIME, + 
HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceGetMappedSize", + ("hipD3D10ResourceGetMappedSize", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10ResourceGetSurfaceDimensions", + ( + "hipD3D10ResourceGetSurfaceDimensions", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceSetMapFlags", + ("hipD3D10ResourceSetMapFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10SetDirect3DDevice", + ("hipD3D10SetDirect3DDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10UnmapResources", + ("hipD3D10UnmapResources", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10UnregisterResource", + ("hipD3D10UnregisterResource", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11DeviceListAll", + ("HIP_D3D11_DEVICE_LIST_ALL", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11DeviceListCurrentFrame", + ( + "HIP_D3D11_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D11DeviceListNextFrame", + ( + "HIP_D3D11_DEVICE_LIST_NEXT_FRAME", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D11GetDevice", + ("hipD3D11GetDevice", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11GetDevices", + ("hipD3D11GetDevices", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D11RegisterResource", + ( + "hipGraphicsD3D11RegisterResource", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D11GetDevice", + ("hipD3D11GetDevice", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11GetDevices", + ("hipD3D11GetDevices", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D11RegisterResource", + ( + "hipGraphicsD3D11RegisterResource", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsVDPAURegisterOutputSurface", + ( + "hipGraphicsVDPAURegisterOutputSurface", + CONV_VDPAU, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + 
), + ( + "cudaGraphicsVDPAURegisterVideoSurface", + ( + "hipGraphicsVDPAURegisterVideoSurface", + CONV_VDPAU, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaVDPAUGetDevice", + ("hipVDPAUGetDevice", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaVDPAUSetVDPAUDevice", + ("hipVDPAUSetDevice", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamConsumerAcquireFrame", + ( + "hipEGLStreamConsumerAcquireFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamConsumerConnect", + ("hipEGLStreamConsumerConnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamConsumerConnectWithFlags", + ( + "hipEGLStreamConsumerConnectWithFlags", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamConsumerReleaseFrame", + ( + "hipEGLStreamConsumerReleaseFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamProducerConnect", + ("hipEGLStreamProducerConnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamProducerDisconnect", + ("hipEGLStreamProducerDisconnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamProducerPresentFrame", + ( + "hipEGLStreamProducerPresentFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamProducerReturnFrame", + ("hipEGLStreamProducerReturnFrame", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsEGLRegisterImage", + ("hipGraphicsEGLRegisterImage", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsResourceGetMappedEglFrame", + ( + "hipGraphicsResourceGetMappedEglFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cublasInit", ("hipblasInit", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasShutdown", + ("hipblasShutdown", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetVersion", + ("hipblasGetVersion", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetError", + 
("hipblasGetError", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasAlloc", ("hipblasAlloc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasFree", ("hipblasFree", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasSetKernelStream", + ("hipblasSetKernelStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetAtomicsMode", + ("hipblasGetAtomicsMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSetAtomicsMode", + ("hipblasSetAtomicsMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetMathMode", + ("hipblasGetMathMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSetMathMode", + ("hipblasSetMathMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("CUBLAS_OP_N", ("HIPBLAS_OP_N", CONV_NUMERIC_LITERAL, API_BLAS)), + ( + "CUBLAS_OP_T", + ("HIPBLAS_OP_T", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_OP_C", + ("HIPBLAS_OP_C", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_SUCCESS", + ("HIPBLAS_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_NOT_INITIALIZED", + ("HIPBLAS_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_ALLOC_FAILED", + ("HIPBLAS_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_INVALID_VALUE", + ("HIPBLAS_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_MAPPING_ERROR", + ("HIPBLAS_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_EXECUTION_FAILED", + ("HIPBLAS_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_INTERNAL_ERROR", + ("HIPBLAS_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_NOT_SUPPORTED", + ("HIPBLAS_STATUS_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_ARCH_MISMATCH", + ("HIPBLAS_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_FILL_MODE_LOWER", + 
("HIPBLAS_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_FILL_MODE_UPPER", + ("HIPBLAS_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_DIAG_NON_UNIT", + ("HIPBLAS_DIAG_NON_UNIT", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ("CUBLAS_DIAG_UNIT", ("HIPBLAS_DIAG_UNIT", CONV_NUMERIC_LITERAL, API_BLAS)), + ("CUBLAS_SIDE_LEFT", ("HIPBLAS_SIDE_LEFT", CONV_NUMERIC_LITERAL, API_BLAS)), + ("CUBLAS_SIDE_RIGHT", ("HIPBLAS_SIDE_RIGHT", CONV_NUMERIC_LITERAL, API_BLAS)), + ( + "CUBLAS_POINTER_MODE_HOST", + ("HIPBLAS_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_POINTER_MODE_DEVICE", + ("HIPBLAS_POINTER_MODE_DEVICE", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_ATOMICS_NOT_ALLOWED", + ( + "HIPBLAS_ATOMICS_NOT_ALLOWED", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_ATOMICS_ALLOWED", + ( + "HIPBLAS_ATOMICS_ALLOWED", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_DATA_FLOAT", + ( + "HIPBLAS_DATA_FLOAT", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_DATA_DOUBLE", + ( + "HIPBLAS_DATA_DOUBLE", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_DATA_HALF", + ("HIPBLAS_DATA_HALF", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "CUBLAS_DATA_INT8", + ("HIPBLAS_DATA_INT8", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "CUBLAS_GEMM_DEFAULT", + ("HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_GEMM_DEFAULT_TENSOR_OP", + ("HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ("cublasCreate", ("hipblasCreate", CONV_MATH_FUNC, API_BLAS)), + ("cublasDestroy", ("hipblasDestroy", CONV_MATH_FUNC, API_BLAS)), + ("cublasSetVector", ("hipblasSetVector", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetVector", ("hipblasGetVector", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSetVectorAsync", + ("hipblasSetVectorAsync", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED), + ), + ( + "cublasGetVectorAsync", + ("hipblasGetVectorAsync", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSetMatrix", ("hipblasSetMatrix", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetMatrix", ("hipblasGetMatrix", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasGetMatrixAsync", + ("hipblasGetMatrixAsync", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSetMatrixAsync", + ("hipblasSetMatrixAsync", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasXerbla", ("hipblasXerbla", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSnrm2", ("hipblasSnrm2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDnrm2", ("hipblasDnrm2", CONV_MATH_FUNC, API_BLAS)), + ("cublasScnrm2", ("hipblasScnrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDznrm2", ("hipblasDznrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasNrm2Ex", + ("hipblasNrm2Ex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSdot", ("hipblasSdot", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSdotBatched", + ("hipblasSdotBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDdot", ("hipblasDdot", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDdotBatched", + ("hipblasDdotBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasCdotu", ("hipblasCdotu", CONV_MATH_FUNC, API_BLAS)), + ("cublasCdotc", ("hipblasCdotc", CONV_MATH_FUNC, API_BLAS)), + ("cublasZdotu", ("hipblasZdotu", CONV_MATH_FUNC, API_BLAS)), + ("cublasZdotc", ("hipblasZdotc", CONV_MATH_FUNC, API_BLAS)), + ("cublasSscal", ("hipblasSscal", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSscalBatched", + ("hipblasSscalBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDscal", ("hipblasDscal", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDscalBatched", + ("hipblasDscalBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasCscal", ("hipblasCscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsscal", ("hipblasCsscal", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), + ("cublasZscal", ("hipblasZscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdscal", ("hipblasZdscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSaxpy", ("hipblasSaxpy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSaxpyBatched", + ("hipblasSaxpyBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDaxpy", ("hipblasDaxpy", CONV_MATH_FUNC, API_BLAS)), + ("cublasCaxpy", ("hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZaxpy", ("hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasScopy", ("hipblasScopy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasScopyBatched", + ("hipblasScopyBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDcopy", ("hipblasDcopy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDcopyBatched", + ("hipblasDcopyBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasCcopy", ("hipblasCcopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZcopy", ("hipblasZcopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSswap", ("hipblasSswap", CONV_MATH_FUNC, API_BLAS)), + ("cublasDswap", ("hipblasDswap", CONV_MATH_FUNC, API_BLAS)), + ("cublasCswap", ("hipblasCswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZswap", ("hipblasZswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIsamax", ("hipblasIsamax", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamax", ("hipblasIdamax", CONV_MATH_FUNC, API_BLAS)), + ("cublasIcamax", ("hipblasIcamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIzamax", ("hipblasIzamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIsamin", ("hipblasIsamin", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamin", ("hipblasIdamin", CONV_MATH_FUNC, API_BLAS)), + ("cublasIcamin", ("hipblasIcamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIzamin", ("hipblasIzamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSasum", ("hipblasSasum", CONV_MATH_FUNC, API_BLAS)), 
+ ( + "cublasSasumBatched", + ("hipblasSasumBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDasum", ("hipblasDasum", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDasumBatched", + ("hipblasDasumBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasScasum", ("hipblasScasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDzasum", ("hipblasDzasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrot", ("hipblasSrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrot", ("hipblasDrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCrot", ("hipblasCrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsrot", ("hipblasCsrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZrot", ("hipblasZrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdrot", ("hipblasZdrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrotg", ("hipblasSrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrotg", ("hipblasDrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCrotg", ("hipblasCrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZrotg", ("hipblasZrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrotm", ("hipblasSrotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrotm", ("hipblasDrotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrotmg", ("hipblasSrotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrotmg", ("hipblasDrotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSgemv", ("hipblasSgemv", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSgemvBatched", + ("hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDgemv", ("hipblasDgemv", CONV_MATH_FUNC, API_BLAS)), + ("cublasCgemv", ("hipblasCgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgemv", ("hipblasZgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSgbmv", ("hipblasSgbmv", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), + ("cublasDgbmv", ("hipblasDgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCgbmv", ("hipblasCgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgbmv", ("hipblasZgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStrmv", ("hipblasStrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtrmv", ("hipblasDtrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtrmv", ("hipblasCtrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrmv", ("hipblasZtrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStbmv", ("hipblasStbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtbmv", ("hipblasDtbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtbmv", ("hipblasCtbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtbmv", ("hipblasZtbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStpmv", ("hipblasStpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtpmv", ("hipblasDtpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtpmv", ("hipblasCtpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtpmv", ("hipblasZtpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStrsv", ("hipblasStrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtrsv", ("hipblasDtrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtrsv", ("hipblasCtrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrsv", ("hipblasZtrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStpsv", ("hipblasStpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtpsv", ("hipblasDtpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtpsv", ("hipblasCtpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtpsv", ("hipblasZtpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStbsv", ("hipblasStbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtbsv", ("hipblasDtbsv", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), + ("cublasCtbsv", ("hipblasCtbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtbsv", ("hipblasZtbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsymv", ("hipblasSsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsymv", ("hipblasDsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsymv", ("hipblasCsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZsymv", ("hipblasZsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChemv", ("hipblasChemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhemv", ("hipblasZhemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsbmv", ("hipblasSsbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsbmv", ("hipblasDsbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChbmv", ("hipblasChbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhbmv", ("hipblasZhbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSspmv", ("hipblasSspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDspmv", ("hipblasDspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChpmv", ("hipblasChpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhpmv", ("hipblasZhpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSger", ("hipblasSger", CONV_MATH_FUNC, API_BLAS)), + ("cublasDger", ("hipblasDger", CONV_MATH_FUNC, API_BLAS)), + ("cublasCgeru", ("hipblasCgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCgerc", ("hipblasCgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgeru", ("hipblasZgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgerc", ("hipblasZgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsyr", ("hipblasSsyr", CONV_MATH_FUNC, API_BLAS)), + ("cublasDsyr", ("hipblasDsyr", CONV_MATH_FUNC, API_BLAS)), + ("cublasCher", ("hipblasCher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZher", ("hipblasZher", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), + ("cublasSspr", ("hipblasSspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDspr", ("hipblasDspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChpr", ("hipblasChpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhpr", ("hipblasZhpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsyr2", ("hipblasSsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsyr2", ("hipblasDsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCher2", ("hipblasCher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZher2", ("hipblasZher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSspr2", ("hipblasSspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDspr2", ("hipblasDspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChpr2", ("hipblasChpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhpr2", ("hipblasZhpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasSgemmBatched", + ("hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgemmBatched", + ("hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasHgemmBatched", + ("hipblasHgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgemmStridedBatched", + ("hipblasSgemmStridedBatched", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasDgemmStridedBatched", + ("hipblasDgemmStridedBatched", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasHgemmStridedBatched", + ("hipblasHgemmStridedBatched", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasCgemmBatched", + ("hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgemm3mBatched", + ("hipblasCgemm3mBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgemmBatched", + ("hipblasZgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgemmStridedBatched", + ( + "hipblasCgemmStridedBatched", + CONV_MATH_FUNC, + API_BLAS, + 
HIP_UNSUPPORTED, + ), + ), + ( + "cublasCgemm3mStridedBatched", + ( + "hipblasCgemm3mStridedBatched", + CONV_MATH_FUNC, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "cublasZgemmStridedBatched", + ( + "hipblasZgemmStridedBatched", + CONV_MATH_FUNC, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "cublasHgemmStridedBatched", + ( + "hipblasHgemmStridedBatched", + CONV_MATH_FUNC, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ("cublasSgemm", ("hipblasSgemm", CONV_MATH_FUNC, API_BLAS)), + ("cublasDgemm", ("hipblasDgemm", CONV_MATH_FUNC, API_BLAS)), + ("cublasCgemm", ("hipblasCgemm", CONV_MATH_FUNC, API_BLAS)), + ("cublasZgemm", ("hipblasZgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasHgemm", ("hipblasHgemm", CONV_MATH_FUNC, API_BLAS)), + ("cublasSsyrk", ("hipblasSsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsyrk", ("hipblasDsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsyrk", ("hipblasCsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZsyrk", ("hipblasZsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCherk", ("hipblasCherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZherk", ("hipblasZherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsyr2k", ("hipblasSsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsyr2k", ("hipblasDsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsyr2k", ("hipblasCsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZsyr2k", ("hipblasZyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsyrkx", ("hipblasSsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsyrkx", ("hipblasDsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsyrkx", ("hipblasCsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZsyrkx", ("hipblasZsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCher2k", ("hipblasCher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZher2k", 
("hipblasZher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCherkx", ("hipblasCherkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZherkx", ("hipblasZherkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsymm", ("hipblasSsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsymm", ("hipblasDsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsymm", ("hipblasCsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZsymm", ("hipblasZsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChemm", ("hipblasChemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhemm", ("hipblasZhemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStrsm", ("hipblasStrsm", CONV_MATH_FUNC, API_BLAS)), + ("cublasDtrsm", ("hipblasDtrsm", CONV_MATH_FUNC, API_BLAS)), + ("cublasCtrsm", ("hipblasCtrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrsm", ("hipblasZtrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasStrsmBatched", + ("hipblasStrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrsmBatched", + ("hipblasDtrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrsmBatched", + ("hipblasCtrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrsmBatched", + ("hipblasZtrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasStrmm", ("hipblasStrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtrmm", ("hipblasDtrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtrmm", ("hipblasCtrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrmm", ("hipblasZtrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSgeam", ("hipblasSgeam", CONV_MATH_FUNC, API_BLAS)), + ("cublasDgeam", ("hipblasDgeam", CONV_MATH_FUNC, API_BLAS)), + ("cublasCgeam", ("hipblasCgeam", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgeam", ("hipblasZgeam", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), + ( + "cublasSgetrfBatched", + ("hipblasSgetrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgetrfBatched", + ("hipblasDgetrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgetrfBatched", + ("hipblasCgetrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgetrfBatched", + ("hipblasZgetrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgetriBatched", + ("hipblasSgetriBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgetriBatched", + ("hipblasDgetriBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgetriBatched", + ("hipblasCgetriBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgetriBatched", + ("hipblasZgetriBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgetrsBatched", + ("hipblasSgetrsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgetrsBatched", + ("hipblasDgetrsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgetrsBatched", + ("hipblasCgetrsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgetrsBatched", + ("hipblasZgetrsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrsmBatched", + ("hipblasStrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrsmBatched", + ("hipblasDtrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrsmBatched", + ("hipblasCtrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrsmBatched", + ("hipblasZtrsmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSmatinvBatched", + ("hipblasSmatinvBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDmatinvBatched", + ("hipblasDmatinvBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCmatinvBatched", + ("hipblasCmatinvBatched", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED), + ), + ( + "cublasZmatinvBatched", + ("hipblasZmatinvBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgeqrfBatched", + ("hipblasSgeqrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgeqrfBatched", + ("hipblasDgeqrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgeqrfBatched", + ("hipblasCgeqrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgeqrfBatched", + ("hipblasZgeqrfBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgelsBatched", + ("hipblasSgelsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgelsBatched", + ("hipblasDgelsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgelsBatched", + ("hipblasCgelsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgelsBatched", + ("hipblasZgelsBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSdgmm", ("hipblasSdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDdgmm", ("hipblasDdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCdgmm", ("hipblasCdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdgmm", ("hipblasZdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStpttr", ("hipblasStpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtpttr", ("hipblasDtpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtpttr", ("hipblasCtpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtpttr", ("hipblasZtpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStrttp", ("hipblasStrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtrttp", ("hipblasDtrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtrttp", ("hipblasCtrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrttp", ("hipblasZtrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCreate_v2", ("hipblasCreate_v2", CONV_MATH_FUNC, 
API_BLAS)), + ("cublasDestroy_v2", ("hipblasDestroy_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasGetVersion_v2", + ("hipblasGetVersion_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSetWorkspace", ("hipblasSetWorkspace", CONV_MATH_FUNC, API_BLAS)), + ("cublasSetStream", ("hipblasSetStream", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetStream", ("hipblasGetStream", CONV_MATH_FUNC, API_BLAS)), + ("cublasSetStream_v2", ("hipblasSetStream_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetStream_v2", ("hipblasGetStream_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasGetPointerMode", + ("hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasSetPointerMode", + ("hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasGetPointerMode_v2", + ("hipblasGetPointerMode_v2", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasSetPointerMode_v2", + ("hipblasSetPointerMode_v2", CONV_MATH_FUNC, API_BLAS), + ), + ("cublasSgemv_v2", ("hipblasSgemv_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDgemv_v2", ("hipblasDgemv_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCgemv_v2", + ("hipblasCgemv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgemv_v2", + ("hipblasZgemv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgbmv_v2", + ("hipblasSgbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgbmv_v2", + ("hipblasDgbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgbmv_v2", + ("hipblasCgbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgbmv_v2", + ("hipblasZgbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrmv_v2", + ("hipblasStrmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrmv_v2", + ("hipblasDtrmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrmv_v2", + ("hipblasCtrmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrmv_v2", + ("hipblasZtrmv_v2", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED), + ), + ( + "cublasStbmv_v2", + ("hipblasStbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtbmv_v2", + ("hipblasDtbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtbmv_v2", + ("hipblasCtbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtbmv_v2", + ("hipblasZtbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStpmv_v2", + ("hipblasStpmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtpmv_v2", + ("hipblasDtpmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtpmv_v2", + ("hipblasCtpmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtpmv_v2", + ("hipblasZtpmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrsv_v2", + ("hipblasStrsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrsv_v2", + ("hipblasDtrsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrsv_v2", + ("hipblasCtrsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrsv_v2", + ("hipblasZtrsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStpsv_v2", + ("hipblasStpsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtpsv_v2", + ("hipblasDtpsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtpsv_v2", + ("hipblasCtpsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtpsv_v2", + ("hipblasZtpsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStbsv_v2", + ("hipblasStbsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtbsv_v2", + ("hipblasDtbsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtbsv_v2", + ("hipblasCtbsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtbsv_v2", + ("hipblasZtbsv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsymv_v2", + ("hipblasSsymv_v2", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED), + ), + ( + "cublasDsymv_v2", + ("hipblasDsymv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsymv_v2", + ("hipblasCsymv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsymv_v2", + ("hipblasZsymv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChemv_v2", + ("hipblasChemv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhemv_v2", + ("hipblasZhemv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsbmv_v2", + ("hipblasSsbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsbmv_v2", + ("hipblasDsbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChbmv_v2", + ("hipblasChbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhbmv_v2", + ("hipblasZhbmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSspmv_v2", + ("hipblasSspmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDspmv_v2", + ("hipblasDspmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChpmv_v2", + ("hipblasChpmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhpmv_v2", + ("hipblasZhpmv_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSger_v2", ("hipblasSger_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDger_v2", ("hipblasDger_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCgeru_v2", + ("hipblasCgeru_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgerc_v2", + ("hipblasCergc_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgeru_v2", + ("hipblasZgeru_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgerc_v2", + ("hipblasZgerc_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsyr_v2", + ("hipblasSsyr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsyr_v2", + ("hipblasDsyr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyr_v2", + 
("hipblasCsyr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsyr_v2", + ("hipblasZsyr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCher_v2", + ("hipblasCher_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZher_v2", + ("hipblasZher_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSspr_v2", + ("hipblasSspr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDspr_v2", + ("hipblasDspr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChpr_v2", + ("hipblasChpr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhpr_v2", + ("hipblasZhpr_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsyr2_v2", + ("hipblasSsyr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsyr2_v2", + ("hipblasDsyr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyr2_v2", + ("hipblasCsyr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsyr2_v2", + ("hipblasZsyr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCher2_v2", + ("hipblasCher2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZher2_v2", + ("hipblasZher2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSspr2_v2", + ("hipblasSspr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDspr2_v2", + ("hipblasDspr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChpr2_v2", + ("hipblasChpr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhpr2_v2", + ("hipblasZhpr2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSgemm_v2", ("hipblasSgemm_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDgemm_v2", ("hipblasDgemm_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCgemm_v2", + ("hipblasCgemm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgemm3m", + ("hipblasCgemm3m", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( 
+ "cublasCgemm3mEx", + ("hipblasCgemm3mEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgemm_v2", + ("hipblasZgemm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgemm3m", + ("hipblasZgemm3m", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgemmEx", + ("hipblasSgemmEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasGemmEx", ("hipblasGemmEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasGemmBatchedEx", + ("hipblasGemmBatchedEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGemmStridedBatchedEx", + ("hipblasGemmStridedBatchedEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgemmEx", + ("hipblasCgemmEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasUint8gemmBias", + ("hipblasUint8gemmBias", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsyrk_v2", + ("hipblasSsyrk_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsyrk_v2", + ("hipblasDsyrk_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyrk_v2", + ("hipblasCsyrk_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsyrk_v2", + ("hipblasZsyrk_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyrkEx", + ("hipblasCsyrkEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyrk3mEx", + ("hipblasCsyrk3mEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCherk_v2", + ("hipblasCherk_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCherkEx", + ("hipblasCherkEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCherk3mEx", + ("hipblasCherk3mEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZherk_v2", + ("hipblasZherk_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsyr2k_v2", + ("hipblasSsyr2k_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsyr2k_v2", + ("hipblasDsyr2k_v2", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED), + ), + ( + "cublasCsyr2k_v2", + ("hipblasCsyr2k_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsyr2k_v2", + ("hipblasZsyr2k_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCher2k_v2", + ("hipblasCher2k_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZher2k_v2", + ("hipblasZher2k_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsymm_v2", + ("hipblasSsymm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsymm_v2", + ("hipblasDsymm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsymm_v2", + ("hipblasCsymm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsymm_v2", + ("hipblasZsymm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChemm_v2", + ("hipblasChemm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhemm_v2", + ("hipblasZhemm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrsm_v2", + ("hipblasStrsm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrsm_v2", + ("hipblasDtrsm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrsm_v2", + ("hipblasCtrsm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrsm_v2", + ("hipblasZtrsm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrmm_v2", + ("hipblasStrmm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrmm_v2", + ("hipblasDtrmm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrmm_v2", + ("hipblasCtrmm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrmm_v2", + ("hipblasZtrmm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSnrm2_v2", ("hipblasSnrm2_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDnrm2_v2", ("hipblasDnrm2_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasScnrm2_v2", + ("hipblasScnrm2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + 
"cublasDznrm2_v2", + ("hipblasDznrm2_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDotEx", ("hipblasDotEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDotcEx", ("hipblasDotcEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSdot_v2", ("hipblasSdot_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDdot_v2", ("hipblasDdot_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCdotu_v2", + ("hipblasCdotu_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCdotc_v2", + ("hipblasCdotc_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZdotu_v2", + ("hipblasZdotu_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZdotc_v2", + ("hipblasZdotc_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasScalEx", ("hipblasScalEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSscal_v2", ("hipblasSscal_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDscal_v2", ("hipblasDscal_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCscal_v2", + ("hipblasCscal_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsscal_v2", + ("hipblasCsscal_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZscal_v2", + ("hipblasZcsal_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZdscal_v2", + ("hipblasZdscal_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasAxpyEx", ("hipblasAxpyEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSaxpy_v2", ("hipblasSaxpy_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDaxpy_v2", ("hipblasDaxpy_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCaxpy_v2", + ("hipblasCaxpy_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZaxpy_v2", + ("hipblasZaxpy_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasScopy_v2", ("hipblasScopy_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDcopy_v2", ("hipblasDcopy_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCcopy_v2", + ("hipblasCcopy_v2", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED), + ), + ( + "cublasZcopy_v2", + ("hipblasZcopy_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSswap_v2", ("hipblasSswap_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDswap_v2", ("hipblasDswap_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCswap_v2", + ("hipblasCswap_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZswap_v2", + ("hipblasZswap_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasIsamax_v2", ("hipblasIsamax_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamax_v2", ("hipblasIdamax_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasIcamax_v2", + ("hipblasIcamax_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasIzamax_v2", + ("hipblasIzamax_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasIsamin_v2", ("hipblasIsamin_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamin_v2", ("hipblasIdamin_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasIcamin_v2", + ("hipblasIcamin_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasIzamin_v2", + ("hipblasIzamin_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSasum_v2", ("hipblasSasum_v2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDasum_v2", ("hipblasDasum_v2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasScasum_v2", + ("hipblasScasum_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDzasum_v2", + ("hipblasDzasum_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSrot_v2", + ("hipblasSrot_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDrot_v2", + ("hipblasDrot_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCrot_v2", + ("hipblasCrot_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsrot_v2", + ("hipblasCsrot_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZrot_v2", + ("hipblasZrot_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZdrot_v2", + ("hipblasZdrot_v2", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED), + ), + ( + "cublasSrotg_v2", + ("hipblasSrotg_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDrotg_v2", + ("hipblasDrotg_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCrotg_v2", + ("hipblasCrotg_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZrotg_v2", + ("hipblasZrotg_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSrotm_v2", + ("hipblasSrotm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDrotm_v2", + ("hipblasDrotm_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSrotmg_v2", + ("hipblasSrotmg_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDrotmg_v2", + ("hipblasDrotmg_v2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasComputeType_t", ("hipblasComputeType_t", CONV_MATH_FUNC, API_BLAS)), + ("CUBLAS_COMPUTE_32I", ("HIPBLAS_COMPUTE_32I", CONV_MATH_FUNC, API_BLAS)), + ("CUBLAS_COMPUTE_32F", ("HIPBLAS_COMPUTE_32F", CONV_MATH_FUNC, API_BLAS)), + ("CUBLAS_COMPUTE_64F", ("HIPBLAS_COMPUTE_64F", CONV_MATH_FUNC, API_BLAS)), + ("cublasLtEpilogue_t", ("hipblasLtEpilogue_t", CONV_MATH_FUNC, API_BLAS)), + ( + "CUBLASLT_EPILOGUE_DEFAULT", + ("HIPBLASLT_EPILOGUE_DEFAULT", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_EPILOGUE_RELU", + ("HIPBLASLT_EPILOGUE_RELU", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_EPILOGUE_BIAS", + ("HIPBLASLT_EPILOGUE_BIAS", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_EPILOGUE_RELU_BIAS", + ("HIPBLASLT_EPILOGUE_RELU_BIAS", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_EPILOGUE_GELU", + ("HIPBLASLT_EPILOGUE_GELU", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_EPILOGUE_GELU_BIAS", + ("HIPBLASLT_EPILOGUE_GELU_BIAS", CONV_MATH_FUNC, API_BLAS), + ), + ("cublasLtHandle_t", ("hipblasLtHandle_t", CONV_MATH_FUNC, API_BLAS)), + ("cublasLtMatmulDesc_t", ("hipblasLtMatmulDesc_t", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasLtMatmulDescOpaque_t", + ("hipblasLtMatmulDescOpaque_t", 
CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulDescAttributes_t", + ("hipblasLtMatmulDescAttributes_t", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_TRANSA", + ("HIPBLASLT_MATMUL_DESC_TRANSA", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_TRANSB", + ("HIPBLASLT_MATMUL_DESC_TRANSB", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_EPILOGUE", + ("HIPBLASLT_MATMUL_DESC_EPILOGUE", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_BIAS_POINTER", + ("HIPBLASLT_MATMUL_DESC_BIAS_POINTER", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_A_SCALE_POINTER", + ("HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_B_SCALE_POINTER", + ("HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_D_SCALE_POINTER", + ("HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_AMAX_D_POINTER", + ("HIPBLASLT_MATMUL_DESC_AMAX_D_POINTER", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", + ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatrixLayout_t", + ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatrixLayoutOpaque_t", + ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatrixLayoutAttribute_t", + ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatrixLayoutCreate", + ("hipblasLtMatrixLayoutCreate", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatrixLayoutDestroy", + ("hipblasLtMatrixLayoutDestroy", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatrixLayoutSetAttribute", + ("hipblasLtMatrixLayoutSetAttribute", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT", + ("HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET", + 
("HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulPreference_t", + ("hipblasLtMatmulPreference_t", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulPreferenceOpaque_t", + ("hipblasLtMatmulPreferenceOpaque_t", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulPreferenceAttributes_t", + ("hipblasLtMatmulPreferenceAttributes_t", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_PREF_SEARCH_MODE", + ("HIPBLASLT_MATMUL_PREF_SEARCH_MODE", CONV_MATH_FUNC, API_BLAS), + ), + ( + "CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES", + ("HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES", CONV_MATH_FUNC, API_BLAS), + ), + ("cublasLtMatmulAlgo_t", ("hipblasLtMatmulAlgo_t", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasLtMatmulHeuristicResult_t", + ("hipblasLtMatmulHeuristicResult_t", CONV_MATH_FUNC, API_BLAS), + ), + ("cublasLtCreate", ("hipblasLtCreate", CONV_MATH_FUNC, API_BLAS)), + ("cublasLtDestroy", ("hipblasLtDestroy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasLtMatmulDescCreate", + ("hipblasLtMatmulDescCreate", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulDescDestroy", + ("hipblasLtMatmulDescDestroy", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulDescSetAttribute", + ("hipblasLtMatmulDescSetAttribute", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulPreferenceCreate", + ("hipblasLtMatmulPreferenceCreate", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulPreferenceDestroy", + ("hipblasLtMatmulPreferenceDestroy", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulPreferenceSetAttribute", + ("hipblasLtMatmulPreferenceSetAttribute", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasLtMatmulAlgoGetHeuristic", + ("hipblasLtMatmulAlgoGetHeuristic", CONV_MATH_FUNC, API_BLAS), + ), + ("cublasLtMatmul", ("hipblasLtMatmul", CONV_MATH_FUNC, API_BLAS)), + ( + "CURAND_STATUS_SUCCESS", + ("HIPRAND_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_VERSION_MISMATCH", + 
("HIPRAND_STATUS_VERSION_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_NOT_INITIALIZED", + ("HIPRAND_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_ALLOCATION_FAILED", + ("HIPRAND_STATUS_ALLOCATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_TYPE_ERROR", + ("HIPRAND_STATUS_TYPE_ERROR", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_OUT_OF_RANGE", + ("HIPRAND_STATUS_OUT_OF_RANGE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_LENGTH_NOT_MULTIPLE", + ("HIPRAND_STATUS_LENGTH_NOT_MULTIPLE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED", + ( + "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED", + CONV_NUMERIC_LITERAL, + API_RAND, + ), + ), + ( + "CURAND_STATUS_LAUNCH_FAILURE", + ("HIPRAND_STATUS_LAUNCH_FAILURE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_PREEXISTING_FAILURE", + ("HIPRAND_STATUS_PREEXISTING_FAILURE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_INITIALIZATION_FAILED", + ("HIPRAND_STATUS_INITIALIZATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_ARCH_MISMATCH", + ("HIPRAND_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_INTERNAL_ERROR", + ("HIPRAND_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_RAND), + ), + ("CURAND_RNG_TEST", ("HIPRAND_RNG_TEST", CONV_NUMERIC_LITERAL, API_RAND)), + ( + "mtgp32dc_params_fast_11213", + ("mtgp32dc_params_fast_11213", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_DEFAULT", + ("HIPRAND_RNG_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_XORWOW", + ("HIPRAND_RNG_PSEUDO_XORWOW", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_MRG32K3A", + ("HIPRAND_RNG_PSEUDO_MRG32K3A", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_MTGP32", + ("HIPRAND_RNG_PSEUDO_MTGP32", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + 
"CURAND_RNG_PSEUDO_MT19937", + ("HIPRAND_RNG_PSEUDO_MT19937", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_PHILOX4_32_10", + ("HIPRAND_RNG_PSEUDO_PHILOX4_32_10", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_DEFAULT", + ("HIPRAND_RNG_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SOBOL32", + ("HIPRAND_RNG_QUASI_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SCRAMBLED_SOBOL32", + ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SOBOL64", + ("HIPRAND_RNG_QUASI_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SCRAMBLED_SOBOL64", + ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "curand_ORDERING_PSEUDO_BEST", + ( + "HIPRAND_ORDERING_PSEUDO_BEST", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_ORDERING_PSEUDO_DEFAULT", + ( + "HIPRAND_ORDERING_PSEUDO_DEFAULT", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_ORDERING_PSEUDO_SEEDED", + ( + "HIPRAND_ORDERING_PSEUDO_SEEDED", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_ORDERING_QUASI_DEFAULT", + ( + "HIPRAND_ORDERING_QUASI_DEFAULT", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_DIRECTION_VECTORS_32_JOEKUO6", + ( + "HIPRAND_DIRECTION_VECTORS_32_JOEKUO6", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", + ( + "HIPRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_DIRECTION_VECTORS_64_JOEKUO6", + ( + "HIPRAND_DIRECTION_VECTORS_64_JOEKUO6", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", + ( + "HIPRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", + CONV_NUMERIC_LITERAL, + API_RAND, + 
HIP_UNSUPPORTED, + ), + ), + ( + "curand_CHOOSE_BEST", + ("HIPRAND_CHOOSE_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_ITR", + ("HIPRAND_ITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_KNUTH", + ("HIPRAND_KNUTH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_HITR", + ("HIPRAND_HITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ("curand_M1", ("HIPRAND_M1", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), + ("curand_M2", ("HIPRAND_M2", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), + ( + "curand_BINARY_SEARCH", + ("HIPRAND_BINARY_SEARCH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_DISCRETE_GAUSS", + ("HIPRAND_DISCRETE_GAUSS", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_REJECTION", + ("HIPRAND_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_DEVICE_API", + ("HIPRAND_DEVICE_API", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_FAST_REJECTION", + ("HIPRAND_FAST_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_3RD", + ("HIPRAND_3RD", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_DEFINITION", + ("HIPRAND_DEFINITION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_POISSON", + ("HIPRAND_POISSON", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ("curandCreateGenerator", ("hiprandCreateGenerator", CONV_MATH_FUNC, API_RAND)), + ( + "curandCreateGeneratorHost", + ("hiprandCreateGeneratorHost", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandCreatePoissonDistribution", + ("hiprandCreatePoissonDistribution", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandDestroyDistribution", + ("hiprandDestroyDistribution", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandDestroyGenerator", + ("hiprandDestroyGenerator", CONV_MATH_FUNC, API_RAND), + ), + ("curandGenerate", ("hiprandGenerate", CONV_MATH_FUNC, API_RAND)), 
+ ( + "curandGenerateLogNormal", + ("hiprandGenerateLogNormal", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandGenerateLogNormalDouble", + ("hiprandGenerateLogNormalDouble", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandGenerateLongLong", + ("hiprandGenerateLongLong", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ("curandGenerateNormal", ("hiprandGenerateNormal", CONV_MATH_FUNC, API_RAND)), + ( + "curandGenerateNormalDouble", + ("hiprandGenerateNormalDouble", CONV_MATH_FUNC, API_RAND), + ), + ("curandGeneratePoisson", ("hiprandGeneratePoisson", CONV_MATH_FUNC, API_RAND)), + ("curandGenerateSeeds", ("hiprandGenerateSeeds", CONV_MATH_FUNC, API_RAND)), + ("curandGenerateUniform", ("hiprandGenerateUniform", CONV_MATH_FUNC, API_RAND)), + ( + "curandGenerateUniformDouble", + ("hiprandGenerateUniformDouble", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandGetDirectionVectors32", + ("hiprandGetDirectionVectors32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandGetDirectionVectors64", + ("hiprandGetDirectionVectors64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandGetProperty", + ("hiprandGetProperty", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandGetScrambleConstants32", + ( + "hiprandGetScrambleConstants32", + CONV_MATH_FUNC, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curandGetScrambleConstants64", + ( + "hiprandGetScrambleConstants64", + CONV_MATH_FUNC, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ("curandGetVersion", ("hiprandGetVersion", CONV_MATH_FUNC, API_RAND)), + ( + "curandSetGeneratorOffset", + ("hiprandSetGeneratorOffset", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandSetGeneratorOrdering", + ("hiprandSetGeneratorOrdering", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandSetPseudoRandomGeneratorSeed", + ("hiprandSetPseudoRandomGeneratorSeed", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandSetQuasiRandomGeneratorDimensions", + ("hiprandSetQuasiRandomGeneratorDimensions", CONV_MATH_FUNC, API_RAND), + ), + 
("curandSetStream", ("hiprandSetStream", CONV_MATH_FUNC, API_RAND)), + ("curand", ("hiprand", CONV_DEVICE_FUNC, API_RAND)), + ("curand4", ("hiprand4", CONV_DEVICE_FUNC, API_RAND)), + ("curand_init", ("hiprand_init", CONV_DEVICE_FUNC, API_RAND)), + ("curand_log_normal", ("hiprand_log_normal", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_log_normal_double", + ("hiprand_log_normal_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_log_normal2", ("hiprand_log_normal2", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_log_normal2_double", + ("hiprand_log_normal2_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_log_normal4", ("hiprand_log_normal4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_log_normal4_double", + ("hiprand_log_normal4_double", CONV_DEVICE_FUNC, API_RAND), + ), + ( + "curand_mtgp32_single", + ("hiprand_mtgp32_single", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_mtgp32_single_specific", + ( + "hiprand_mtgp32_single_specific", + CONV_DEVICE_FUNC, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_mtgp32_specific", + ("hiprand_mtgp32_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ("curand_normal", ("hiprand_normal", CONV_DEVICE_FUNC, API_RAND)), + ( + "curandMakeMTGP32Constants", + ("hiprandMakeMTGP32Constants", CONV_DEVICE_FUNC, API_RAND), + ), + ( + "curandMakeMTGP32KernelState", + ("hiprandMakeMTGP32KernelState", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_normal_double", ("hiprand_normal_double", CONV_DEVICE_FUNC, API_RAND)), + ("curand_normal2", ("hiprand_normal2", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_normal2_double", + ("hiprand_normal2_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_normal4", ("hiprand_normal4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_normal4_double", + ("hiprand_normal4_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_uniform", ("hiprand_uniform", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_uniform_double", + ("hiprand_uniform_double", CONV_DEVICE_FUNC, API_RAND), + 
), + ( + "curand_uniform2_double", + ("hiprand_uniform2_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_uniform4", ("hiprand_uniform4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_uniform4_double", + ("hiprand_uniform4_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_discrete", ("hiprand_discrete", CONV_DEVICE_FUNC, API_RAND)), + ("curand_discrete4", ("hiprand_discrete4", CONV_DEVICE_FUNC, API_RAND)), + ("curand_poisson", ("hiprand_poisson", CONV_DEVICE_FUNC, API_RAND)), + ("curand_poisson4", ("hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_Philox4x32_10", + ("hiprand_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ("mtgp32_kernel_params", ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND)), + ("CUFFT_FORWARD", ("HIPFFT_FORWARD", CONV_NUMERIC_LITERAL, API_BLAS)), + ("CUFFT_INVERSE", ("HIPFFT_BACKWARD", CONV_NUMERIC_LITERAL, API_BLAS)), + ( + "CUFFT_COMPATIBILITY_DEFAULT", + ( + "HIPFFT_COMPATIBILITY_DEFAULT", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ("cuComplex", ("hipComplex", CONV_TYPE, API_BLAS)), + ("cuDoubleComplex", ("hipDoubleComplex", CONV_TYPE, API_BLAS)), + ("cufftResult_t", ("hipfftResult_t", CONV_TYPE, API_FFT)), + ("cufftResult", ("hipfftResult", CONV_TYPE, API_FFT)), + ("CUFFT_SUCCESS", ("HIPFFT_SUCCESS", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_INVALID_PLAN", ("HIPFFT_INVALID_PLAN", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_ALLOC_FAILED", ("HIPFFT_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_INVALID_TYPE", ("HIPFFT_INVALID_TYPE", CONV_NUMERIC_LITERAL, API_FFT)), + ( + "CUFFT_INVALID_VALUE", + ("HIPFFT_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_INTERNAL_ERROR", + ("HIPFFT_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_FFT), + ), + ("CUFFT_EXEC_FAILED", ("HIPFFT_EXEC_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_SETUP_FAILED", ("HIPFFT_SETUP_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_INVALID_SIZE", ("HIPFFT_INVALID_SIZE", 
CONV_NUMERIC_LITERAL, API_FFT)), + ( + "CUFFT_UNALIGNED_DATA", + ("HIPFFT_UNALIGNED_DATA", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_INCOMPLETE_PARAMETER_LIST", + ("HIPFFT_INCOMPLETE_PARAMETER_LIST", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_INVALID_DEVICE", + ("HIPFFT_INVALID_DEVICE", CONV_NUMERIC_LITERAL, API_FFT), + ), + ("CUFFT_PARSE_ERROR", ("HIPFFT_PARSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_NO_WORKSPACE", ("HIPFFT_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_FFT)), + ( + "CUFFT_NOT_IMPLEMENTED", + ("HIPFFT_NOT_IMPLEMENTED", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_LICENSE_ERROR", + ("HIPFFT_LICENSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED), + ), + ( + "CUFFT_NOT_SUPPORTED", + ("HIPFFT_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_FFT), + ), + ("cufftType_t", ("hipfftType_t", CONV_TYPE, API_FFT)), + ("cufftType", ("hipfftType", CONV_TYPE, API_FFT)), + ("CUFFT_R2C", ("HIPFFT_R2C", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_C2R", ("HIPFFT_C2R", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_C2C", ("HIPFFT_C2C", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_D2Z", ("HIPFFT_D2Z", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_Z2D", ("HIPFFT_Z2D", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_Z2Z", ("HIPFFT_Z2Z", CONV_NUMERIC_LITERAL, API_FFT)), + ( + "cufftCompatibility_t", + ("hipfftCompatibility_t", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + ), + ( + "cufftCompatibility", + ("hipfftCompatibility", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + ), + ( + "CUFFT_COMPATIBILITY_FFTW_PADDING", + ( + "HIPFFT_COMPATIBILITY_FFTW_PADDING", + CONV_NUMERIC_LITERAL, + API_FFT, + HIP_UNSUPPORTED, + ), + ), + ("cufftReal", ("hipfftReal", CONV_TYPE, API_FFT)), + ("cufftDoubleReal", ("hipfftDoubleReal", CONV_TYPE, API_FFT)), + ("cufftComplex", ("hipfftComplex", CONV_TYPE, API_FFT)), + ("cufftDoubleComplex", ("hipfftDoubleComplex", CONV_TYPE, API_FFT)), + ("cufftHandle", ("hipfftHandle", CONV_TYPE, API_FFT)), + ("cufftPlan1d", ("hipfftPlan1d", CONV_MATH_FUNC, 
API_FFT)), + ("cufftPlan2d", ("hipfftPlan2d", CONV_MATH_FUNC, API_FFT)), + ("cufftPlan3d", ("hipfftPlan3d", CONV_MATH_FUNC, API_FFT)), + ("cufftPlanMany", ("hipfftPlanMany", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlan1d", ("hipfftMakePlan1d", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlan2d", ("hipfftMakePlan2d", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlan3d", ("hipfftMakePlan3d", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlanMany", ("hipfftMakePlanMany", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlanMany64", ("hipfftMakePlanMany64", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSizeMany64", ("hipfftGetSizeMany64", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimate1d", ("hipfftEstimate1d", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimate2d", ("hipfftEstimate2d", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimate3d", ("hipfftEstimate3d", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimateMany", ("hipfftEstimateMany", CONV_MATH_FUNC, API_FFT)), + ("cufftCreate", ("hipfftCreate", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize1d", ("hipfftGetSize1d", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize2d", ("hipfftGetSize2d", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize3d", ("hipfftGetSize3d", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSizeMany", ("hipfftGetSizeMany", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize", ("hipfftGetSize", CONV_MATH_FUNC, API_FFT)), + ("cufftSetWorkArea", ("hipfftSetWorkArea", CONV_MATH_FUNC, API_FFT)), + ( + "cufftSetAutoAllocation", + ("hipfftSetAutoAllocation", CONV_MATH_FUNC, API_FFT), + ), + ("cufftXtExec", ("hipfftXtExec", CONV_MATH_FUNC, API_FFT)), + ("cufftXtMakePlanMany", ("hipfftXtMakePlanMany", CONV_MATH_FUNC, API_FFT)), + ("cufftExecC2C", ("hipfftExecC2C", CONV_MATH_FUNC, API_FFT)), + ("cufftExecR2C", ("hipfftExecR2C", CONV_MATH_FUNC, API_FFT)), + ("cufftExecC2R", ("hipfftExecC2R", CONV_MATH_FUNC, API_FFT)), + ("cufftExecZ2Z", ("hipfftExecZ2Z", CONV_MATH_FUNC, API_FFT)), + ("cufftExecD2Z", ("hipfftExecD2Z", CONV_MATH_FUNC, API_FFT)), + ("cufftExecZ2D", ("hipfftExecZ2D", 
CONV_MATH_FUNC, API_FFT)), + ("cufftSetStream", ("hipfftSetStream", CONV_MATH_FUNC, API_FFT)), + ("cufftDestroy", ("hipfftDestroy", CONV_MATH_FUNC, API_FFT)), + ("cufftGetVersion", ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT)), + ( + "cufftGetProperty", + ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED), + ), + ("nvrtcResult", ("hiprtcResult", CONV_TYPE, API_RTC)), + ("NVRTC_SUCCESS", ("HIPRTC_SUCCESS", CONV_TYPE, API_RTC)), + ( + "NVRTC_ERROR_OUT_OF_MEMORY", + ("HIPRTC_ERROR_OUT_OF_MEMORY", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_PROGRAM_CREATION_FAILURE", + ("HIPRTC_ERROR_PROGRAM_CREATION_FAILURE", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_INVALID_INPUT", + ("HIPRTC_ERROR_INVALID_INPUT", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_INVALID_PROGRAM", + ("HIPRTC_ERROR_INVALID_PROGRAM", CONV_TYPE, API_RTC), + ), + ("NVRTC_ERROR_COMPILATION", ("HIPRTC_ERROR_COMPILATION", CONV_TYPE, API_RTC)), + ( + "NVRTC_ERROR_BUILTIN_OPERATION_FAILURE", + ("HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION", + ("HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID", + ("HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_INTERNAL_ERROR", + ("HIPRTC_ERROR_INTERNAL_ERROR", CONV_TYPE, API_RTC), + ), + ("nvrtcGetErrorString", ("hiprtcGetErrorString", CONV_JIT, API_RTC)), + ("nvrtcVersion", ("hiprtcVersion", CONV_JIT, API_RTC)), + ("nvrtcProgram", ("hiprtcProgram", CONV_TYPE, API_RTC)), + ("nvrtcAddNameExpression", ("hiprtcAddNameExpression", CONV_JIT, API_RTC)), + ("nvrtcCompileProgram", ("hiprtcCompileProgram", CONV_JIT, API_RTC)), + ("nvrtcCreateProgram", ("hiprtcCreateProgram", CONV_JIT, API_RTC)), + ("nvrtcDestroyProgram", ("hiprtcDestroyProgram", CONV_JIT, API_RTC)), + ("nvrtcGetLoweredName", ("hiprtcGetLoweredName", CONV_JIT, API_RTC)), + ("nvrtcGetProgramLog", 
("hiprtcGetProgramLog", CONV_JIT, API_RTC)), + ("nvrtcGetProgramLogSize", ("hiprtcGetProgramLogSize", CONV_JIT, API_RTC)), + ("nvrtcGetPTX", ("hiprtcGetCode", CONV_JIT, API_RTC)), + ("nvrtcGetPTXSize", ("hiprtcGetCodeSize", CONV_JIT, API_RTC)), + ("thrust::cuda", ("thrust::hip", CONV_MATH_FUNC, API_BLAS)), + ( + "cudaCpuDeviceId", + ("hipCpuDeviceId", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + # The caffe2 directory does a string match; pytorch does a word-boundary match. + # Patterns such as 'cub::' will not match for pytorch. + # We list all current uses of cub symbols for this reason. + ("cub::", ("hipcub::", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::ArgMax", ("hipcub::ArgMax", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::ArgMin", ("hipcub::ArgMin", CONV_SPECIAL_FUNC, API_RUNTIME)), + ( + "cub::BLOCK_SCAN_WARP_SCANS", + ("hipcub::BLOCK_SCAN_WARP_SCANS", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::BLOCK_REDUCE_WARP_REDUCTIONS", + ("hipcub::BLOCK_REDUCE_WARP_REDUCTIONS", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::BLOCK_STORE_WARP_TRANSPOSE", + ("hipcub::BLOCK_STORE_WARP_TRANSPOSE", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::BLOCK_LOAD_DIRECT", + ("hipcub::BLOCK_LOAD_DIRECT", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::BLOCK_STORE_DIRECT", + ("hipcub::BLOCK_STORE_DIRECT", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ("cub::BlockReduce", ("hipcub::BlockReduce", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::BlockScan", ("hipcub::BlockScan", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::BlockLoad", ("hipcub::BlockLoad", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::BlockStore", ("hipcub::BlockStore", CONV_SPECIAL_FUNC, API_RUNTIME)), + ( + "cub::BlockRakingLayout", + ("hipcub::BlockRakingLayout", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::Uninitialized", + ("hipcub::Uninitialized", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ("cub::RowMajorTid", ("hipcub::RowMajorTid", CONV_SPECIAL_FUNC, API_RUNTIME)), + ( + "cub::CachingDeviceAllocator", + 
("hipcub::CachingDeviceAllocator", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::CountingInputIterator", + ("hipcub::CountingInputIterator", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::DeviceRadixSort", + ("hipcub::DeviceRadixSort", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ("cub::DeviceReduce", ("hipcub::DeviceReduce", CONV_SPECIAL_FUNC, API_RUNTIME)), + ( + "cub::DeviceRunLengthEncode", + ("hipcub::DeviceRunLengthEncode", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ("cub::DeviceScan", ("hipcub::DeviceScan", CONV_SPECIAL_FUNC, API_RUNTIME)), + ( + "cub::DeviceSegmentedRadixSort", + ("hipcub::DeviceSegmentedRadixSort", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::DeviceSegmentedReduce", + ("hipcub::DeviceSegmentedReduce", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ("cub::DeviceSelect", ("hipcub::DeviceSelect", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::KeyValuePair", ("hipcub::KeyValuePair", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::Max", ("hipcub::Max", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::Min", ("hipcub::Min", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::Sum", ("hipcub::Sum", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::Log2", ("hipcub::Log2", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::LaneId", ("hipcub::LaneId", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::WarpMask", ("hipcub::WarpMask", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::ShuffleIndex", ("hipcub::ShuffleIndex", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::ShuffleDown", ("hipcub::ShuffleDown", CONV_SPECIAL_FUNC, API_RUNTIME)), + ( + "cub::ArgIndexInputIterator", + ("hipcub::ArgIndexInputIterator", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ( + "cub::TransformInputIterator", + ("hipcub::TransformInputIterator", CONV_SPECIAL_FUNC, API_RUNTIME), + ), + ("cub::WarpReduce", ("hipcub::WarpReduce", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("cub::CTA_SYNC", ("hipcub::CTA_SYNC", CONV_SPECIAL_FUNC, API_RUNTIME)), + ("nvtxMark", ("roctxMark", CONV_OTHER, API_ROCTX)), + ("nvtxMarkA", ("roctxMarkA", CONV_OTHER, 
API_ROCTX)), + ("nvtxRangePushA", ("roctxRangePushA", CONV_OTHER, API_ROCTX)), + ("nvtxRangePop", ("roctxRangePop", CONV_OTHER, API_ROCTX)), + ("nvtxRangeStartA", ("roctxRangeStartA", CONV_OTHER, API_ROCTX)), + ("nvtxRangeEnd", ("roctxRangeStop", CONV_OTHER, API_ROCTX)), + ("nvmlReturn_t", ("rsmi_status_t", CONV_OTHER, API_ROCMSMI)), + ("NVML_SUCCESS", ("RSMI_STATUS_SUCCESS", CONV_OTHER, API_ROCMSMI)), + ("NVML_P2P_CAPS_INDEX_READ", ("RSMI_STATUS_SUCCESS", CONV_OTHER, API_ROCMSMI)), + ("NVML_P2P_STATUS_OK", ("RSMI_STATUS_SUCCESS", CONV_OTHER, API_ROCMSMI)), + ( + "NVML_ERROR_INSUFFICIENT_SIZE", + ("RSMI_STATUS_INSUFFICIENT_SIZE", CONV_OTHER, API_ROCMSMI), + ), + ("nvmlDevice_t", ("uint32_t", CONV_OTHER, API_ROCMSMI)), + ("nvmlGpuP2PStatus_t", ("bool", CONV_OTHER, API_ROCMSMI)), + ("nvmlProcessInfo_t", ("rsmi_process_info_t", CONV_OTHER, API_ROCMSMI)), + ("nvmlGpuP2PCapsIndex_t", ("uint32_t", CONV_OTHER, API_ROCMSMI)), + ] +) + +CUDA_SPECIAL_MAP = collections.OrderedDict( + [ + # SPARSE + ("cusparseStatus_t", ("hipsparseStatus_t", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseHandle_t", ("hipsparseHandle_t", CONV_MATH_FUNC, API_SPECIAL)), + ("cuComplex", ("hipComplex", CONV_TYPE, API_SPECIAL)), + ("cuDoubleComplex", ("hipDoubleComplex", CONV_TYPE, API_SPECIAL)), + ( + "CUSPARSE_POINTER_MODE_HOST", + ("HIPSPARSE_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ("cusparseOperation_t", ("hipsparseOperation_t", CONV_TYPE, API_SPECIAL)), + ( + "cusparseCreateMatDescr", + ("hipsparseCreateMatDescr", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseCreate", ("hipsparseCreate", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseDestroyMatDescr", + ("hipsparseDestroyMatDescr", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseDestroy", ("hipsparseDestroy", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseXcoo2csr", ("hipsparseXcoo2csr", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseMatDescr_t", ("hipsparseMatDescr_t", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseDiagType_t", 
("hipsparseDiagType_t", CONV_TYPE, API_SPECIAL)), + ( + "CUSPARSE_DIAG_TYPE_UNIT", + ("HIPSPARSE_DIAG_TYPE_UNIT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_DIAG_TYPE_NON_UNIT", + ("HIPSPARSE_DIAG_TYPE_NON_UNIT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "cusparseSetMatDiagType", + ("hipsparseSetMatDiagType", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseFillMode_t", ("hipsparseFillMode_t", CONV_TYPE, API_SPECIAL)), + ( + "CUSPARSE_FILL_MODE_UPPER", + ("HIPSPARSE_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_FILL_MODE_LOWER", + ("HIPSPARSE_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "cusparseSetMatFillMode", + ("hipsparseSetMatFillMode", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseDirection_t", ("hipsparseDirection_t", CONV_TYPE, API_SPECIAL)), + ( + "CUSPARSE_DIRECTION_ROW", + ("HIPSPARSE_DIRECTION_ROW", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_DIRECTION_COLUMN", + ("HIPSPARSE_DIRECTION_COLUMN", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ("cusparseSolvePolicy_t", ("hipsparseSolvePolicy_t", CONV_TYPE, API_SPECIAL)), + ( + "CUSPARSE_SOLVE_POLICY_NO_LEVEL", + ("HIPSPARSE_SOLVE_POLICY_NO_LEVEL", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_SOLVE_POLICY_USE_LEVEL", + ("HIPSPARSE_SOLVE_POLICY_USE_LEVEL", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "cusparseCreateBsrsv2Info", + ("hipsparseCreateBsrsv2Info", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCreateBsrsm2Info", + ("hipsparseCreateBsrsm2Info", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDestroyBsrsv2Info", + ("hipsparseDestroyBsrsv2Info", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDestroyBsrsm2Info", + ("hipsparseDestroyBsrsm2Info", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseSbsrmm", ("hipsparseSbsrmm", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseDbsrmm", ("hipsparseDbsrmm", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseCbsrmm", ("hipsparseCbsrmm", CONV_MATH_FUNC, API_SPECIAL)), + 
("cusparseZbsrmm", ("hipsparseZbsrmm", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseSbsrmv", ("hipsparseSbsrmv", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseDbsrmv", ("hipsparseDbsrmv", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseCbsrmv", ("hipsparseCbsrmv", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseZbsrmv", ("hipsparseZbsrmv", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseSbsrsv2_bufferSize", + ("hipsparseSbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDbsrsv2_bufferSize", + ("hipsparseDbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCbsrsv2_bufferSize", + ("hipsparseCbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseZbsrsv2_bufferSize", + ("hipsparseZbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSbsrsv2_analysis", + ("hipsparseSbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDbsrsv2_analysis", + ("hipsparseDbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCbsrsv2_analysis", + ("hipsparseCbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseZbsrsv2_analysis", + ("hipsparseZbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSbsrsv2_solve", + ("hipsparseSbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDbsrsv2_solve", + ("hipsparseDbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCbsrsv2_solve", + ("hipsparseCbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseZbsrsv2_solve", + ("hipsparseZbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSbsrsm2_bufferSize", + ("hipsparseSbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDbsrsm2_bufferSize", + ("hipsparseDbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCbsrsm2_bufferSize", + ("hipsparseCbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseZbsrsm2_bufferSize", + ("hipsparseZbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + 
"cusparseSbsrsm2_analysis", + ("hipsparseSbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDbsrsm2_analysis", + ("hipsparseDbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCbsrsm2_analysis", + ("hipsparseCbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseZbsrsm2_analysis", + ("hipsparseZbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSbsrsm2_solve", + ("hipsparseSbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDbsrsm2_solve", + ("hipsparseDbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCbsrsm2_solve", + ("hipsparseCbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseZbsrsm2_solve", + ("hipsparseZbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseScsrmm2", ("hipsparseScsrmm2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseDcsrmm2", ("hipsparseDcsrmm2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseCcsrmm2", ("hipsparseCcsrmm2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseZcsrmm2", ("hipsparseZcsrmm2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseScsrmm", ("hipsparseScsrmm", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseDcsrmm", ("hipsparseDcsrmm", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseXcsrsort_bufferSizeExt", + ("hipsparseXcsrsort_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCreateCsrgemm2Info", + ("hipsparseCreateCsrgemm2Info", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDestroyCsrgemm2Info", + ("hipsparseDestroyCsrgemm2Info", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseXcsrgemm2Nnz", + ("hipsparseXcsrgemm2Nnz", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDcsrgemm2_bufferSizeExt", + ("hipsparseDcsrgemm2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseScsrgemm2_bufferSizeExt", + ("hipsparseScsrgemm2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseDcsrgemm2", ("hipsparseDcsrgemm2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseScsrgemm2", ("hipsparseScsrgemm2", 
CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseSetPointerMode", + ("hipsparseSetPointerMode", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseXcsrgeam2Nnz", + ("hipsparseXcsrgeam2Nnz", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseScsrgeam2_bufferSizeExt", + ("hipsparseScsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDcsrgeam2_bufferSizeExt", + ("hipsparseDcsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCcsrgeam2_bufferSizeExt", + ("hipsparseCcsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseZcsrgeam2_bufferSizeExt", + ("hipsparseZcsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseScsrgeam2", ("hipsparseScsrgeam2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseDcsrgeam2", ("hipsparseDcsrgeam2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseCcsrgeam2", ("hipsparseCcsrgeam2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseZcsrgeam2", ("hipsparseZcsrgeam2", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseXcsrsort", ("hipsparseXcsrsort", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseXbsrsm2_zeroPivot", + ("hipsparseXbsrsm2_zeroPivot", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseXbsrsv2_zeroPivot", + ("hipsparseXbsrsv2_zeroPivot", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseXcoosort_bufferSizeExt", + ("hipsparseXcoosort_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseXcoosortByRow", + ("hipsparseXcoosortByRow", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseSetStream", ("hipsparseSetStream", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseCreateIdentityPermutation", + ("hipsparseCreateIdentityPermutation", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSetMatIndexBase", + ("hipsparseSetMatIndexBase", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseSetMatType", ("hipsparseSetMatType", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseSpMV", ("hipsparseSpMV", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseSpMV_bufferSize", + ("hipsparseSpMV_bufferSize", 
CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseSpMM", ("hipsparseSpMM", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseSpMM_bufferSize", + ("hipsparseSpMM_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseCreateDnMat", ("hipsparseCreateDnMat", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseDnMatSetStridedBatch", + ("hipsparseDnMatSetStridedBatch", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCsrSetStridedBatch", + ("hipsparseCsrSetStridedBatch", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseCreateDnVec", ("hipsparseCreateDnVec", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseCreateCsr", ("hipsparseCreateCsr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseDestroyDnMat", + ("hipsparseDestroyDnMat", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDestroyDnVec", + ("hipsparseDestroyDnVec", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDestroySpMat", + ("hipsparseDestroySpMat", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSpGEMM_destroyDescr", + ("hipsparseSpGEMM_destroyDescr", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseCreateCoo", ("hipsparseCreateCoo", CONV_MATH_FUNC, API_SPECIAL)), + ("cusparseCreateCsr", ("hipsparseCreateCsr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseSpGEMM_createDescr", + ("hipsparseSpGEMM_createDescr", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseDnMatSetStridedBatch", + ("hipsparseDnMatSetStridedBatch", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseSpGEMM_copy", ("hipsparseSpGEMM_copy", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseSDDMM_bufferSize", + ("hipsparseSDDMM_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSDDMM_preprocess", + ("hipsparseSDDMM_preprocess", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseSDDMM", ("hipsparseSDDMM", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusparseSpGEMM_compute", + ("hipsparseSpGEMM_compute", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseSpGEMM_workEstimation", + ("hipsparseSpGEMM_workEstimation", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + 
"cusparseSpMatGetSize", + ("hipsparseSpMatGetSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusparseCsrSetPointers", + ("hipsparseCsrSetPointers", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusparseSpMVAlg_t", ("hipsparseSpMVAlg_t", CONV_TYPE, API_SPECIAL)), + ("cusparseSpMMAlg_t", ("hipsparseSpMMAlg_t", CONV_TYPE, API_SPECIAL)), + ("cusparseIndexType_t", ("hipsparseIndexType_t", CONV_TYPE, API_SPECIAL)), + # Unsupported ("cusparseMatDescr", ("hipsparseMatDescr", CONV_TYPE, API_SPECIAL)), + # Unsupported ("cusparseDnMatDescr", ("hipsparseDnMatDescr", CONV_TYPE, API_SPECIAL)), + # Unsupported ("cusparseDnVecDescr", ("hipsparseDnVecDescr", CONV_TYPE, API_SPECIAL)), + # Unsupported ("cusparseSpMatDescr", ("hipsparseSpMatDescr", CONV_TYPE, API_SPECIAL)), + # Unsupported ("cusparseSpGEMMDescr", ("hipsparseSpGEMMDescr", CONV_TYPE, API_SPECIAL)), + ("cusparseDnMatDescr_t", ("hipsparseDnMatDescr_t", CONV_TYPE, API_SPECIAL)), + ("cusparseDnVecDescr_t", ("hipsparseDnVecDescr_t", CONV_TYPE, API_SPECIAL)), + ("cusparseSpMatDescr_t", ("hipsparseSpMatDescr_t", CONV_TYPE, API_SPECIAL)), + ("cusparseSpGEMMDescr_t", ("hipsparseSpGEMMDescr_t", CONV_TYPE, API_SPECIAL)), + ( + "CUSPARSE_INDEX_32I", + ("HIPSPARSE_INDEX_32I", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_INDEX_64I", + ("HIPSPARSE_INDEX_64I", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_ORDER_COL", + ("HIPSPARSE_ORDER_COLUMN", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_MV_ALG_DEFAULT", + ("HIPSPARSE_MV_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_MM_ALG_DEFAULT", + ("HIPSPARSE_MM_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_SPMM_COO_ALG1", + ("HIPSPARSE_SPMM_COO_ALG1", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_SPMM_COO_ALG2", + ("HIPSPARSE_SPMM_COO_ALG2", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_COOMV_ALG", + ("HIPSPARSE_COOMV_ALG", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_SPMM_CSR_ALG1", 
+ ("HIPSPARSE_CSRMM_ALG1", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_SPGEMM_DEFAULT", + ("HIPSPARSE_SPGEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_SDDMM_ALG_DEFAULT", + ("HIPSPARSE_SDDMM_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_SUCCESS", + ("HIPSPARSE_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_NOT_INITIALIZED", + ("HIPSPARSE_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_ALLOC_FAILED", + ("HIPSPARSE_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_INVALID_VALUE", + ("HIPSPARSE_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_MAPPING_ERROR", + ("HIPSPARSE_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_EXECUTION_FAILED", + ("HIPSPARSE_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_INTERNAL_ERROR", + ("HIPSPARSE_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED", + ( + "HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED", + CONV_NUMERIC_LITERAL, + API_SPECIAL, + ), + ), + ( + "CUSPARSE_STATUS_ARCH_MISMATCH", + ("HIPSPARSE_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_STATUS_ZERO_PIVOT", + ("HIPSPARSE_STATUS_ZERO_PIVOT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_OPERATION_TRANSPOSE", + ("HIPSPARSE_OPERATION_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_OPERATION_NON_TRANSPOSE", + ("HIPSPARSE_OPERATION_NON_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE", + ( + "HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE", + CONV_NUMERIC_LITERAL, + API_SPECIAL, + ), + ), + ( + "CUSPARSE_INDEX_BASE_ZERO", + ("HIPSPARSE_INDEX_BASE_ZERO", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + 
"CUSPARSE_INDEX_BASE_ONE", + ("HIPSPARSE_INDEX_BASE_ONE", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSPARSE_MATRIX_TYPE_GENERAL", + ("HIPSPARSE_MATRIX_TYPE_GENERAL", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + # SOLVER + ("cublasOperation_t", ("hipsolverOperation_t", CONV_TYPE, API_SPECIAL)), + ("CUBLAS_OP_N", ("HIPSOLVER_OP_N", CONV_NUMERIC_LITERAL, API_SPECIAL)), + ( + "CUBLAS_OP_T", + ("HIPSOLVER_OP_T", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUBLAS_OP_C", + ("HIPSOLVER_OP_C", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ("cublasFillMode_t", ("hipsolverFillMode_t", CONV_TYPE, API_SPECIAL)), + ( + "CUBLAS_FILL_MODE_LOWER", + ("HIPSOLVER_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUBLAS_FILL_MODE_UPPER", + ("HIPSOLVER_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ("cublasSideMode_t", ("hipsolverSideMode_t", CONV_TYPE, API_SPECIAL)), + ( + "CUBLAS_SIDE_LEFT", + ("HIPSOLVER_SIDE_LEFT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUBLAS_SIDE_RIGHT", + ("HIPSOLVER_SIDE_RIGHT", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ("cusolverEigMode_t", ("hipsolverEigMode_t", CONV_TYPE, API_SPECIAL)), + ( + "CUSOLVER_EIG_MODE_VECTOR", + ("HIPSOLVER_EIG_MODE_VECTOR", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ( + "CUSOLVER_EIG_MODE_NOVECTOR", + ("HIPSOLVER_EIG_MODE_NOVECTOR", CONV_NUMERIC_LITERAL, API_SPECIAL), + ), + ("syevjInfo_t", ("hipsolverSyevjInfo_t", CONV_TYPE, API_SPECIAL)), + ( + "cusolverDnCreateSyevjInfo", + ("hipsolverDnCreateSyevjInfo", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnXsyevjSetSortEig", + ("hipsolverDnXsyevjSetSortEig", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDestroySyevjInfo", + ("hipsolverDnDestroySyevjInfo", CONV_MATH_FUNC, API_SPECIAL), + ), + ("gesvdjInfo_t", ("hipsolverGesvdjInfo_t", CONV_TYPE, API_SPECIAL)), + ( + "cusolverDnCreateGesvdjInfo", + ("hipsolverDnCreateGesvdjInfo", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnXgesvdjSetSortEig", + 
("hipsolverDnXgesvdjSetSortEig", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDestroyGesvdjInfo", + ("hipsolverDnDestroyGesvdjInfo", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnHandle_t", ("hipsolverDnHandle_t", CONV_TYPE, API_SPECIAL)), + ("cusolverDnCreate", ("hipsolverDnCreate", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnSetStream", ("hipsolverDnSetStream", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnDestroy", ("hipsolverDnDestroy", CONV_MATH_FUNC, API_SPECIAL)), + # from aten/src/ATen/native/hip/linalg/HIPSolver.cpp + ("cusolverDnParams_t", ("hipsolverDnParams_t", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnCgeqrf", ("hipsolverDnCgeqrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCgeqrf_bufferSize", + ("hipsolverDnCgeqrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCgesvd", ("hipsolverDnCgesvd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCgesvd_bufferSize", + ("hipsolverDnCgesvd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCgesvdj", ("hipsolverDnCgesvdj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCgesvdjBatched", + ("hipsolverDnCgesvdjBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnCgesvdjBatched_bufferSize", + ("hipsolverDnCgesvdjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnCgesvdj_bufferSize", + ("hipsolverDnCgesvdj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCgetrf", ("hipsolverDnCgetrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCgetrf_bufferSize", + ("hipsolverDnCgetrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCgetrs", ("hipsolverDnCgetrs", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnCheevd", ("hipsolverDnCheevd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCheevd_bufferSize", + ("hipsolverDnCheevd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCheevj", ("hipsolverDnCheevj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCheevjBatched", + ("hipsolverDnCheevjBatched", CONV_MATH_FUNC, 
API_SPECIAL), + ), + ( + "cusolverDnCheevjBatched_bufferSize", + ("hipsolverDnCheevjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnCheevj_bufferSize", + ("hipsolverDnCheevj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCpotrf", ("hipsolverDnCpotrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCpotrfBatched", + ("hipsolverDnCpotrfBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnCpotrf_bufferSize", + ("hipsolverDnCpotrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCpotrs", ("hipsolverDnCpotrs", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCpotrsBatched", + ("hipsolverDnCpotrsBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCungqr", ("hipsolverDnCungqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCungqr_bufferSize", + ("hipsolverDnCungqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnCunmqr", ("hipsolverDnCunmqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnCunmqr_bufferSize", + ("hipsolverDnCunmqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDgeqrf", ("hipsolverDnDgeqrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDgeqrf_bufferSize", + ("hipsolverDnDgeqrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDgesvd", ("hipsolverDnDgesvd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDgesvd_bufferSize", + ("hipsolverDnDgesvd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDgesvdj", ("hipsolverDnDgesvdj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDgesvdjBatched", + ("hipsolverDnDgesvdjBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDgesvdjBatched_bufferSize", + ("hipsolverDnDgesvdjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDgesvdj_bufferSize", + ("hipsolverDnDgesvdj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDgetrf", ("hipsolverDnDgetrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDgetrf_bufferSize", + ("hipsolverDnDgetrf_bufferSize", 
CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDgetrs", ("hipsolverDnDgetrs", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnDorgqr", ("hipsolverDnDorgqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDorgqr_bufferSize", + ("hipsolverDnDorgqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDormqr", ("hipsolverDnDormqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDormqr_bufferSize", + ("hipsolverDnDormqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDpotrf", ("hipsolverDnDpotrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDpotrfBatched", + ("hipsolverDnDpotrfBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDpotrf_bufferSize", + ("hipsolverDnDpotrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDpotrs", ("hipsolverDnDpotrs", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDpotrsBatched", + ("hipsolverDnDpotrsBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDsyevd", ("hipsolverDnDsyevd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDsyevd_bufferSize", + ("hipsolverDnDsyevd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDsyevj", ("hipsolverDnDsyevj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnDsyevjBatched", + ("hipsolverDnDsyevjBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDsyevjBatched_bufferSize", + ("hipsolverDnDsyevjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDsyevj_bufferSize", + ("hipsolverDnDsyevj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSgeqrf", ("hipsolverDnSgeqrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSgeqrf_bufferSize", + ("hipsolverDnSgeqrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSgesvd", ("hipsolverDnSgesvd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSgesvd_bufferSize", + ("hipsolverDnSgesvd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSgesvdj", ("hipsolverDnSgesvdj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSgesvdjBatched", + 
("hipsolverDnSgesvdjBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnSgesvdjBatched_bufferSize", + ("hipsolverDnSgesvdjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnSgesvdj_bufferSize", + ("hipsolverDnSgesvdj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSgetrf", ("hipsolverDnSgetrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSgetrf_bufferSize", + ("hipsolverDnSgetrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSgetrs", ("hipsolverDnSgetrs", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnSorgqr", ("hipsolverDnSorgqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSorgqr_bufferSize", + ("hipsolverDnSorgqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSormqr", ("hipsolverDnSormqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSormqr_bufferSize", + ("hipsolverDnSormqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSpotrf", ("hipsolverDnSpotrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSpotrfBatched", + ("hipsolverDnSpotrfBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnSpotrf_bufferSize", + ("hipsolverDnSpotrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSpotrs", ("hipsolverDnSpotrs", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSpotrsBatched", + ("hipsolverDnSpotrsBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSsyevd", ("hipsolverDnSsyevd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSsyevd_bufferSize", + ("hipsolverDnSsyevd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnSsyevj", ("hipsolverDnSsyevj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnSsyevjBatched", + ("hipsolverDnSsyevjBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnSsyevjBatched_bufferSize", + ("hipsolverDnSsyevjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnSsyevj_bufferSize", + ("hipsolverDnSsyevj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnXgeqrf", 
("hipsolverDnXgeqrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnXgeqrf_bufferSize", + ("hipsolverDnXgeqrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnXpotrf", ("hipsolverDnXpotrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnXpotrf_bufferSize", + ("hipsolverDnXpotrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnXpotrs", ("hipsolverDnXpotrs", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnXsyevd", ("hipsolverDnXsyevd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnXsyevd_bufferSize", + ("hipsolverDnXsyevd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZgeqrf", ("hipsolverDnZgeqrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZgeqrf_bufferSize", + ("hipsolverDnZgeqrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZgesvd", ("hipsolverDnZgesvd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZgesvd_bufferSize", + ("hipsolverDnZgesvd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZgesvdj", ("hipsolverDnZgesvdj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZgesvdjBatched", + ("hipsolverDnZgesvdjBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnZgesvdjBatched_bufferSize", + ("hipsolverDnZgesvdjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnZgesvdj_bufferSize", + ("hipsolverDnZgesvdj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZgetrf", ("hipsolverDnZgetrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZgetrf_bufferSize", + ("hipsolverDnZgetrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZgetrs", ("hipsolverDnZgetrs", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnZheevd", ("hipsolverDnZheevd", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZheevd_bufferSize", + ("hipsolverDnZheevd_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZheevj", ("hipsolverDnZheevj", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZheevjBatched", + ("hipsolverDnZheevjBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( 
+ "cusolverDnZheevjBatched_bufferSize", + ("hipsolverDnZheevjBatched_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnZheevj_bufferSize", + ("hipsolverDnZheevj_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZpotrf", ("hipsolverDnZpotrf", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZpotrfBatched", + ("hipsolverDnZpotrfBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnZpotrf_bufferSize", + ("hipsolverDnZpotrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZpotrs", ("hipsolverDnZpotrs", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZpotrsBatched", + ("hipsolverDnZpotrsBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZungqr", ("hipsolverDnZungqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZungqr_bufferSize", + ("hipsolverDnZungqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnZunmqr", ("hipsolverDnZunmqr", CONV_MATH_FUNC, API_SPECIAL)), + ( + "cusolverDnZunmqr_bufferSize", + ("hipsolverDnZunmqr_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + # sytrf + ( + "cusolverDnDsytrf_bufferSize", + ("hipsolverDnDsytrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnSsytrf_bufferSize", + ("hipsolverDnSsytrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnZsytrf_bufferSize", + ("hipsolverDnZsytrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnCsytrf_bufferSize", + ("hipsolverDnCsytrf_bufferSize", CONV_MATH_FUNC, API_SPECIAL), + ), + ("cusolverDnDsytrf", ("hipsolverDnDsytrf", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnSsytrf", ("hipsolverDnSsytrf", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnZsytrf", ("hipsolverDnZsytrf", CONV_MATH_FUNC, API_SPECIAL)), + ("cusolverDnCsytrf", ("hipsolverDnCsytrf", CONV_MATH_FUNC, API_SPECIAL)), + # gesdva strided + ( + "cusolverDnSgesvdaStridedBatched_bufferSize", + ( + "hipsolverDnSgesvdaStridedBatched_bufferSize", + CONV_MATH_FUNC, + API_SPECIAL, + ), + ), + ( + 
"cusolverDnDgesvdaStridedBatched_bufferSize", + ( + "hipsolverDnDgesvdaStridedBatched_bufferSize", + CONV_MATH_FUNC, + API_SPECIAL, + ), + ), + ( + "cusolverDnCgesvdaStridedBatched_bufferSize", + ( + "hipsolverDnCgesvdaStridedBatched_bufferSize", + CONV_MATH_FUNC, + API_SPECIAL, + ), + ), + ( + "cusolverDnZgesvdaStridedBatched_bufferSize", + ( + "hipsolverDnZgesvdaStridedBatched_bufferSize", + CONV_MATH_FUNC, + API_SPECIAL, + ), + ), + ( + "cusolverDnSgesvdaStridedBatched", + ("hipsolverDnSgesvdaStridedBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnDgesvdaStridedBatched", + ("hipsolverDnDgesvdaStridedBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnCgesvdaStridedBatched", + ("hipsolverDnCgesvdaStridedBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnZgesvdaStridedBatched", + ("hipsolverDnZgesvdaStridedBatched", CONV_MATH_FUNC, API_SPECIAL), + ), + # gesvdj SetXXX + ( + "cusolverDnXgesvdjSetTolerance", + ("hipsolverDnXgesvdjSetTolerance", CONV_MATH_FUNC, API_SPECIAL), + ), + ( + "cusolverDnXgesvdjSetMaxSweeps", + ("hipsolverDnXgesvdjSetMaxSweeps", CONV_MATH_FUNC, API_SPECIAL), + ), + ] +) + +PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict( + [ + ("USE_CUDA", ("USE_ROCM", API_PYTORCH)), + ("CUDA_VERSION", ("TORCH_HIP_VERSION", API_PYTORCH)), + ("cudaHostAllocator", ("hipHostAllocator", API_PYTORCH)), + ("cudaDeviceAllocator", ("hipDeviceAllocator", API_PYTORCH)), + ("define MAX_NUM_BLOCKS 200", ("define MAX_NUM_BLOCKS 64", API_PYTORCH)), + ("cuda::CUDAGuard", ("hip::HIPGuardMasqueradingAsCUDA", API_PYTORCH)), + ("CUDAGuard", ("HIPGuardMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::OptionalCUDAGuard", + ("hip::OptionalHIPGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ("OptionalCUDAGuard", ("OptionalHIPGuardMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::CUDAStreamGuard", + ("hip::HIPStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ("CUDAStreamGuard", ("HIPStreamGuardMasqueradingAsCUDA", API_PYTORCH)), + ( + 
"cuda::OptionalCUDAStreamGuard", + ("hip::OptionalHIPStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "OptionalCUDAStreamGuard", + ("OptionalHIPStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "cuda::CUDAMultiStreamGuard", + ("hip::HIPMultiStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "CUDAMultiStreamGuard", + ("HIPMultiStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + # Only get needs to be transformed this way; all the other ones can go + # straight to the normal versions hip::HIPCachingAllocator + ( + "cuda::CUDACachingAllocator::get", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::get", API_PYTORCH), + ), + ( + "CUDACachingAllocator::get", + ("HIPCachingAllocatorMasqueradingAsCUDA::get", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::recordStream", + ( + "hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", + API_PYTORCH, + ), + ), + ( + "CUDACachingAllocator::recordStream", + ( + "HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", + API_PYTORCH, + ), + ), + ( + "cuda::CUDAAllocator::recordStream", + ( + "hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", + API_PYTORCH, + ), + ), + ( + "CUDAAllocator::recordStream", + ( + "HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", + API_PYTORCH, + ), + ), + ("cuda::CUDAStream", ("hip::HIPStreamMasqueradingAsCUDA", API_PYTORCH)), + ("CUDAStream", ("HIPStreamMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::getStreamFromPool", + ("hip::getStreamFromPoolMasqueradingAsCUDA", API_PYTORCH), + ), + ("getStreamFromPool", ("getStreamFromPoolMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::getDefaultCUDAStream", + ("hip::getDefaultHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "cuda::getStreamFromExternal", + ("hip::getStreamFromExternalMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "getStreamFromExternal", + ("getStreamFromExternalMasqueradingAsCUDA", API_PYTORCH), + ), + ( + 
"cuda::getDefaultCUDAStream", + ("hip::getDefaultHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "getDefaultCUDAStream", + ("getDefaultHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "cuda::getCurrentCUDAStream", + ("hip::getCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "getCurrentCUDAStream", + ("getCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "cuda::setCurrentCUDAStream", + ("hip::setCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "setCurrentCUDAStream", + ("setCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "ATen/cudnn/Handle.h", + ("ATen/miopen/Handle.h", API_PYTORCH), + ), + # TODO: Undo this special-case; see the header for motivation behind this + # hack. It's VERY important this is only applied to PyTorch HIPify. + ( + "c10/cuda/CUDAGuard.h", + ("ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h", API_PYTORCH), + ), + ( + "c10/cuda/CUDACachingAllocator.h", + ("ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h", API_PYTORCH), + ), + ( + "c10/cuda/CUDAStream.h", + ("ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h", API_PYTORCH), + ), + ("gloo/cuda.h", ("gloo/hip.h", API_PYTORCH)), + ( + "gloo/cuda_allreduce_halving_doubling.h", + ("gloo/hip_allreduce_halving_doubling.h", API_PYTORCH), + ), + ( + "gloo/cuda_allreduce_halving_doubling_pipelined.h", + ("gloo/hip_allreduce_halving_doubling_pipelined.h", API_PYTORCH), + ), + ("gloo/cuda_allreduce_ring.h", ("gloo/hip_allreduce_ring.h", API_PYTORCH)), + ( + "gloo/cuda_broadcast_one_to_all.h", + ("gloo/hip_broadcast_one_to_all.h", API_PYTORCH), + ), + ( + "gloo::CudaAllreduceHalvingDoublingPipelined", + ("gloo::HipAllreduceHalvingDoublingPipelined", API_PYTORCH), + ), + ("gloo::CudaBroadcastOneToAll", ("gloo::HipBroadcastOneToAll", API_PYTORCH)), + ("gloo::CudaHostWorkspace", ("gloo::HipHostWorkspace", API_PYTORCH)), + ("gloo::CudaDeviceWorkspace", ("gloo::HipDeviceWorkspace", API_PYTORCH)), + ("CUDNN_RNN_RELU", ("miopenRNNRELU", API_PYTORCH)), + 
("CUDNN_RNN_TANH", ("miopenRNNTANH", API_PYTORCH)), + ("CUDNN_LSTM", ("miopenLSTM", API_PYTORCH)), + ("CUDNN_GRU", ("miopenGRU", API_PYTORCH)), + ("cudnnRNNMode_t", ("miopenRNNMode_t", API_PYTORCH)), + ("magma_queue_create_from_cuda", ("magma_queue_create_from_hip", API_PYTORCH)), + ] +) + +CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict( + [ + ("cuda_stream", ("hip_stream", API_CAFFE2)), + # if the header is a native hip folder (under hip directory), + # there is no need to add a hip path to it; the trie in hipify script + # takes this mapping order to forbid further replacement + ("/hip/", ("/hip/", API_CAFFE2)), + ("/context_gpu", ("/hip/context_gpu", API_CAFFE2)), + ("/common_gpu", ("/hip/common_gpu", API_CAFFE2)), + ("/cuda_nccl_gpu", ("/hip/hip_nccl_gpu", API_CAFFE2)), + ("/mixed_utils", ("/hip/mixed_utils", API_CAFFE2)), + ("/operator_fallback_gpu", ("/hip/operator_fallback_gpu", API_CAFFE2)), + ( + "/spatial_batch_norm_op_impl", + ("/hip/spatial_batch_norm_op_impl", API_CAFFE2), + ), + ( + "/recurrent_network_executor_gpu", + ("/hip/recurrent_network_executor_gpu", API_CAFFE2), + ), + ( + "/generate_proposals_op_util_nms_gpu", + ("/hip/generate_proposals_op_util_nms_gpu", API_CAFFE2), + ), + ("/max_pool_with_index_gpu", ("/hip/max_pool_with_index_gpu", API_CAFFE2)), + ("/THCCachingAllocator_gpu", ("/hip/THCCachingAllocator_gpu", API_CAFFE2)), + ("/top_k_heap_selection", ("/hip/top_k_heap_selection", API_CAFFE2)), + ("/top_k_radix_selection", ("/hip/top_k_radix_selection", API_CAFFE2)), + ("/GpuAtomics", ("/hip/GpuAtomics", API_CAFFE2)), + ("/GpuDefs", ("/hip/GpuDefs", API_CAFFE2)), + ("/GpuScanUtils", ("/hip/GpuScanUtils", API_CAFFE2)), + ("/GpuBitonicSort", ("/hip/GpuBitonicSort", API_CAFFE2)), + ("/math/reduce.cuh", ("/math/hip/reduce.cuh", API_CAFFE2)), + ( + "/sgd/adagrad_fused_op_gpu.cuh", + ("/sgd/hip/adagrad_fused_op_gpu.cuh", API_CAFFE2), + ), + ( + "/operators/segment_reduction_op_gpu.cuh", + ("/operators/hip/segment_reduction_op_gpu.cuh", 
API_CAFFE2), + ), + ("/gather_op.cuh", ("/hip/gather_op.cuh", API_CAFFE2)), + ("caffe2/core/common_cudnn.h", ("caffe2/core/hip/common_miopen.h", API_CAFFE2)), + ("REGISTER_CUDA_OPERATOR", ("REGISTER_HIP_OPERATOR", API_CAFFE2)), + ("CUDA_1D_KERNEL_LOOP", ("HIP_1D_KERNEL_LOOP", API_CAFFE2)), + ("CUDAContext", ("HIPContext", API_CAFFE2)), + ("CAFFE_CUDA_NUM_THREADS", ("CAFFE_HIP_NUM_THREADS", API_CAFFE2)), + ("HasCudaGPU", ("HasHipGPU", API_CAFFE2)), + ("__expf", ("expf", API_CAFFE2)), + ("CUBLAS_ENFORCE", ("HIPBLAS_ENFORCE", API_CAFFE2)), + ("CUBLAS_CHECK", ("HIPBLAS_CHECK", API_CAFFE2)), + ("cublas_handle", ("hipblas_handle", API_CAFFE2)), + ("CURAND_ENFORCE", ("HIPRAND_ENFORCE", API_CAFFE2)), + ("CURAND_CHECK", ("HIPRAND_CHECK", API_CAFFE2)), + ("curandGenerateUniform", ("hiprandGenerateUniform", API_CAFFE2)), + ("curand_generator", ("hiprand_generator", API_CAFFE2)), + ("CaffeCudaGetDevice", ("CaffeHipGetDevice", API_CAFFE2)), + # do not rename CUDA_KERNEL_ASSERT, lazyInitCUDA in caffe2 sources + # the ordered dict guarantees this pattern will match first, before "CUDA" + ("CUDA_KERNEL_ASSERT", ("CUDA_KERNEL_ASSERT", API_CAFFE2)), + ("lazyInitCUDA", ("lazyInitCUDA", API_CAFFE2)), + ("CUDA_VERSION", ("TORCH_HIP_VERSION", API_CAFFE2)), + ("CUDA", ("HIP", API_CAFFE2)), + ("Cuda", ("Hip", API_CAFFE2)), + ("cuda_", ("hip_", API_CAFFE2)), + ("_cuda", ("_hip", API_CAFFE2)), + ("CUDNN", ("MIOPEN", API_CAFFE2)), + ("CuDNN", ("MIOPEN", API_CAFFE2)), + ("cudnn", ("miopen", API_CAFFE2)), + ("namespace cuda", ("namespace hip", API_CAFFE2)), + ("cuda::CUDAGuard", ("hip::HIPGuard", API_CAFFE2)), + ("cuda::OptionalCUDAGuard", ("hip::OptionalHIPGuard", API_CAFFE2)), + ("cuda::CUDAStreamGuard", ("hip::HIPStreamGuard", API_CAFFE2)), + ("cuda::OptionalCUDAStreamGuard", ("hip::OptionalHIPStreamGuard", API_CAFFE2)), + ("c10/cuda/CUDAGuard.h", ("c10/hip/HIPGuard.h", API_CAFFE2)), + ("gloo/cuda", ("gloo/hip", API_CAFFE2)), + ] +) + +# We must tread very carefully here. 
Blanket conversions like are done +# in CAFFE2_SPECIFIC_MAPPINGS are not presently supported on PyTorch, +# because a regex for CUDA will also match a filename like CUDAGuard.h, +# but the HIPIFY script doesn't presently move the file and so the substitution +# will be invalid. Instead, we specifically list out every identifier +# and file from c10/cuda which may be used externally, and do substitutions this +# way. +# +# NB: if you want a transformation to ONLY apply to the c10/ directory, +# put it as API_CAFFE2 +C10_MAPPINGS = collections.OrderedDict( + [ + ("CUDA_VERSION", ("TORCH_HIP_VERSION", API_PYTORCH)), + ("CUDA_LAUNCH_BLOCKING=1", ("AMD_SERIALIZE_KERNEL=3", API_C10)), + ("CUDA_LAUNCH_BLOCKING", ("AMD_SERIALIZE_KERNEL", API_C10)), + ("cuda::compat::", ("hip::compat::", API_C10)), + ("c10/cuda/CUDAAlgorithm.h", ("c10/hip/HIPAlgorithm.h", API_C10)), + ("c10/cuda/CUDADeviceAssertion.h", ("c10/hip/HIPDeviceAssertion.h", API_C10)), + ( + "c10/cuda/CUDADeviceAssertionHost.h", + ("c10/hip/HIPDeviceAssertionHost.h", API_C10), + ), + ("c10/cuda/CUDAException.h", ("c10/hip/HIPException.h", API_C10)), + ("c10/cuda/CUDAMacros.h", ("c10/hip/HIPMacros.h", API_C10)), + ("c10/cuda/CUDAMathCompat.h", ("c10/hip/HIPMathCompat.h", API_C10)), + ("c10/cuda/CUDAFunctions.h", ("c10/hip/HIPFunctions.h", API_C10)), + ("c10/cuda/CUDAMiscFunctions.h", ("c10/hip/HIPMiscFunctions.h", API_C10)), + ("c10/cuda/CUDAStream.h", ("c10/hip/HIPStream.h", API_C10)), + ("c10/cuda/CUDAGraphsC10Utils.h", ("c10/hip/HIPGraphsC10Utils.h", API_C10)), + ("c10/cuda/CUDAAllocatorConfig.h", ("c10/hip/HIPAllocatorConfig.h", API_C10)), + ("c10/cuda/CUDACachingAllocator.h", ("c10/hip/HIPCachingAllocator.h", API_C10)), + ("c10/cuda/impl/CUDATest.h", ("c10/hip/impl/HIPTest.h", API_C10)), + ("c10/cuda/impl/CUDAGuardImpl.h", ("c10/hip/impl/HIPGuardImpl.h", API_C10)), + ( + "c10/cuda/impl/cuda_cmake_macros.h", + ("c10/hip/impl/hip_cmake_macros.h", API_C10), + ), + ("C10_CUDA_CHECK", ("C10_HIP_CHECK", API_C10)), + 
("C10_CUDA_CHECK_WARN", ("C10_HIP_CHECK_WARN", API_C10)), + ("C10_CUDA_ERROR_HANDLED", ("C10_HIP_ERROR_HANDLED", API_C10)), + ("C10_CUDA_IGNORE_ERROR", ("C10_HIP_IGNORE_ERROR", API_C10)), + ("C10_CUDA_CLEAR_ERROR", ("C10_HIP_CLEAR_ERROR", API_C10)), + ("c10::cuda", ("c10::hip", API_C10)), + ("cuda::CUDAStream", ("hip::HIPStream", API_C10)), + ("CUDAStream", ("HIPStream", API_C10)), + # This substitution is not permissible, because there's another copy of this + # function in torch/cuda.h + # ("cuda::device_count", ("hip::device_count", API_C10)), + ("cuda::current_device", ("hip::current_device", API_C10)), + ("cuda::set_device", ("hip::set_device", API_C10)), + ("cuda::device_synchronize", ("hip::device_synchronize", API_C10)), + ("cuda::getStreamFromPool", ("hip::getStreamFromPool", API_C10)), + ("getStreamFromPool", ("getStreamFromPool", API_C10)), + ("cuda::getDefaultCUDAStream", ("hip::getDefaultHIPStream", API_C10)), + ("getDefaultCUDAStream", ("getDefaultHIPStream", API_C10)), + ("cuda::getCurrentCUDAStream", ("hip::getCurrentHIPStream", API_C10)), + ("getCurrentCUDAStream", ("getCurrentHIPStream", API_C10)), + ("cuda::get_cuda_check_prefix", ("hip::get_cuda_check_prefix", API_C10)), + ("cuda::setCurrentCUDAStream", ("hip::setCurrentHIPStream", API_C10)), + ("setCurrentCUDAStream", ("setCurrentHIPStream", API_C10)), + ("cuda::CUDACachingAllocator", ("hip::HIPCachingAllocator", API_C10)), + ("CUDACachingAllocator", ("HIPCachingAllocator", API_C10)), + ("cuda::CUDAAllocatorConfig", ("hip::HIPAllocatorConfig", API_C10)), + ("CUDAAllocatorConfig", ("HIPAllocatorConfig", API_C10)), + ("pinned_use_cuda_host_register", ("pinned_use_hip_host_register", API_C10)), + ("c10::cuda::CUDAAllocator", ("c10::hip::HIPAllocator", API_C10)), + ("cuda::CUDAAllocator", ("hip::HIPAllocator", API_C10)), + ("CUDAStreamCaptureModeGuard", ("HIPStreamCaptureModeGuard", API_C10)), + ( + "cuda::CUDAStreamCaptureModeGuard", + ("cuda::HIPStreamCaptureModeGuard", API_C10), + ), + 
("CUDAAllocator", ("HIPAllocator", API_C10)), + ("C10_CUDA_KERNEL_LAUNCH_CHECK", ("C10_HIP_KERNEL_LAUNCH_CHECK", API_C10)), + ] +) + +# NB: C10 mappings are more specific than Caffe2 mappings, so run them +# first +CUDA_TO_HIP_MAPPINGS = [ + CUDA_IDENTIFIER_MAP, + CUDA_TYPE_NAME_MAP, + CUDA_INCLUDE_MAP, + CUDA_SPECIAL_MAP, + C10_MAPPINGS, + PYTORCH_SPECIFIC_MAPPINGS, + CAFFE2_SPECIFIC_MAPPINGS, +] diff --git a/aiter/jit/utils/hipify/hipify_python.py b/aiter/jit/utils/hipify/hipify_python.py new file mode 100644 index 0000000000000000000000000000000000000000..6073fdfd170ad61a9d19f44e768d04acdb52ae48 --- /dev/null +++ b/aiter/jit/utils/hipify/hipify_python.py @@ -0,0 +1,1353 @@ +# SPDX-License-Identifier: MIT + #!/usr/bin/env python3 +# mypy: allow-untyped-defs +"""The Python Hipify script. +## + +# Facebook Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+""" +import argparse +import fnmatch +import re +import shutil +import sys +import os + +from . import constants +from .cuda_to_hip_mappings import CUDA_TO_HIP_MAPPINGS +from .cuda_to_hip_mappings import MATH_TRANSPILATIONS + +from typing import Dict, List, Iterator, Optional +from collections.abc import Mapping, Iterable +from enum import Enum + + +class CurrentState(Enum): + INITIALIZED = 1 + DONE = 2 + + +class HipifyResult: + def __init__(self, current_state, hipified_path): + self.current_state = current_state + self.hipified_path = hipified_path + self.status = "" + + def __str__(self): + return f"HipifyResult:: current_state: {self.current_state}, hipified_path : {self.hipified_path}, status: {self.status}" + + +HipifyFinalResult = Dict[str, HipifyResult] +HIPIFY_C_BREADCRUMB = "// !!! This is a file automatically generated by hipify!!!\n" +HIPIFY_FINAL_RESULT: HipifyFinalResult = {} + +# Hardcode the PyTorch template map +"""This dictionary provides the mapping from PyTorch kernel template types +to their actual types.""" +PYTORCH_TEMPLATE_MAP = {"Dtype": "scalar_t", "T": "scalar_t"} + +__all__ = [ + "InputError", + "openf", + "bcolors", + "GeneratedFileCleaner", + "match_extensions", + "matched_files_iter", + "preprocess_file_and_save_result", + "compute_stats", + "add_dim3", + "processKernelLaunches", + "find_closure_group", + "find_bracket_group", + "find_parentheses_group", + "replace_math_functions", + "hip_header_magic", + "replace_extern_shared", + "get_hip_file_path", + "is_out_of_place", + "is_pytorch_file", + "is_cusparse_file", + "is_special_file", + "is_caffe2_gpu_file", + "is_caffe2_gpu_file", + "Trie", + "preprocessor", + "file_specific_replacement", + "file_add_header", + "fix_static_global_kernels", + "extract_arguments", + "str2bool", + "CurrentState", + "HipifyResult", + "hipify", +] + + +class InputError(Exception): + # Exception raised for errors in the input. 
class InputError(Exception):
    """Raised when hipify is handed input it cannot process."""

    def __init__(self, message):
        super().__init__(message)
        # Stored separately so callers can read the text without the prefix
        # that __str__ adds.
        self.message = message

    def __str__(self):
        return f"Input error: {self.message}"


def openf(filename, mode):
    """Open ``filename`` in ``mode``, ignoring encoding/decoding errors."""
    return open(filename, mode, errors="ignore")


# Color coding for printing
class bcolors:
    """ANSI escape sequences used to colour terminal output."""

    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    ENDC = "\033[0m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"


# To the programmer, the output of hipify most likely are intermediates.
# This class allows users of hipify to ask for a cleanup by running the
# hipify and compilation in a `with` block that instantiates this context
# manager with keep_intermediates=False.
# It is a good idea to keep intermediates (in case of errors or to
# not recompile unchanged files), but in cases where you don't want to
# keep them (e.g. in the CI), this can be used to remove files.
class GeneratedFileCleaner:
    """Context manager that removes the files and directories it created.

    Only paths that did not exist before being created through
    :meth:`open` / :meth:`makedirs` are tracked.  On exit they are deleted
    unless ``keep_intermediates`` is true.
    """

    def __init__(self, keep_intermediates=False):
        self.keep_intermediates = keep_intermediates
        self.files_to_clean = set()
        # A list (not a set): directories must be removed in reverse
        # creation order so that children go before their parents.
        self.dirs_to_clean = []

    def __enter__(self):
        return self

    def open(self, fn, *args, **kwargs):
        """Open ``fn`` like the builtin ``open`` and track it if it is new.

        The path is registered for cleanup only after the underlying
        ``open`` call succeeds.  Registering it up front would make
        ``__exit__`` try to unlink a file that was never created (for
        example a failed read of a missing file) and raise a second,
        misleading ``FileNotFoundError``.
        """
        is_new = not os.path.exists(fn)
        handle = open(fn, *args, **kwargs)
        if is_new:
            self.files_to_clean.add(os.path.abspath(fn))
        return handle

    def makedirs(self, dn, exist_ok=False):
        """Create ``dn`` and any missing parents, tracking each new directory.

        Raises ``FileExistsError`` (from ``os.mkdir``) when ``dn`` already
        exists and ``exist_ok`` is false.
        """
        parent, n = os.path.split(dn)
        if not n:
            # ``dn`` ended with a separator; split once more to get the leaf.
            parent, n = os.path.split(parent)
        if parent and n and not os.path.exists(parent):
            self.makedirs(parent, exist_ok=True)
        if not os.path.isdir(dn) or not exist_ok:
            os.mkdir(dn)
            self.dirs_to_clean.append(os.path.abspath(dn))

    def __exit__(self, type, value, traceback):
        if not self.keep_intermediates:
            for f in self.files_to_clean:
                os.unlink(f)
            # Deepest directories were appended last; remove them first.
            for d in reversed(self.dirs_to_clean):
                os.rmdir(d)
def match_extensions(filename: str, extensions: Iterable) -> bool:
    """Return True when ``filename`` ends with any suffix in ``extensions``."""
    for suffix in extensions:
        if filename.endswith(suffix):
            return True
    return False


def _fnmatch(filepath, patterns):
    """Return True when ``filepath`` matches at least one glob in ``patterns``."""
    for pattern in patterns:
        if fnmatch.fnmatch(filepath, pattern):
            return True
    return False


def matched_files_iter(
    root_path: str,
    includes: Iterable = (),
    ignores: Iterable = (),
    extensions: Iterable = (),
    out_of_place_only: bool = False,
    is_pytorch_extension: bool = False,
) -> Iterator[str]:
    """Yield the paths under ``root_path`` that should be hipified.

    A file is selected when it matches a glob in ``includes``, matches no
    glob in ``ignores``, and either carries one of ``extensions`` or was
    named verbatim in ``includes``.  Unless ``is_pytorch_extension`` is set,
    the file must additionally be a PyTorch or Caffe2 GPU file and, when
    ``out_of_place_only`` is set, one that is hipified out of place.
    """
    exact_matches = set(includes)

    # This is a very rough heuristic; really, we want to avoid scanning
    # any file which is not checked into source control, so we just prune
    # the biggest time sinks at the top level of the tree.
    for abs_dirpath, dirs, filenames in os.walk(root_path, topdown=True):
        rel_dirpath = os.path.relpath(abs_dirpath, root_path)
        if rel_dirpath == ".":
            for pruned in (".git", "build"):
                if pruned in dirs:
                    dirs.remove(pruned)
            if "third_party" in dirs:
                dirs.remove("third_party")
                dirs.append("third_party/nvfuser")

        for filename in filenames:
            filepath = os.path.join(abs_dirpath, filename)
            rel_filepath = os.path.join(rel_dirpath, filename)

            if not _fnmatch(filepath, includes):
                continue
            if _fnmatch(filepath, ignores):
                continue
            # Extensions are respected UNLESS the whole filename was written
            # verbatim in ``includes``, in which case it is always accepted.
            if not match_extensions(filepath, extensions) and filepath not in exact_matches:
                continue

            # For pytorch extensions every matching file is considered.
            if not is_pytorch_extension:
                if not (is_pytorch_file(rel_filepath) or is_caffe2_gpu_file(rel_filepath)):
                    continue
                if out_of_place_only and not is_out_of_place(rel_filepath):
                    continue
            yield filepath
def preprocess_file_and_save_result(
    output_directory: str,
    filepath: str,
    all_files: Iterable,
    header_include_dirs: Iterable,
    stats: Dict[str, List],
    hip_clang_launch: bool,
    is_pytorch_extension: bool,
    clean_ctx: "GeneratedFileCleaner",
    show_progress: bool,
) -> None:
    """Hipify one file and record the outcome in ``HIPIFY_FINAL_RESULT``.

    An ``INITIALIZED`` placeholder is stored under the file's absolute path
    before ``preprocessor`` is called; ``preprocessor`` looks that entry up.
    The entry is then replaced with the result ``preprocessor`` returns.
    """
    fin_path = os.path.abspath(os.path.join(output_directory, filepath))
    HIPIFY_FINAL_RESULT[fin_path] = HipifyResult(
        current_state=CurrentState.INITIALIZED, hipified_path=fin_path
    )
    result = preprocessor(
        output_directory,
        filepath,
        all_files,
        header_include_dirs,
        stats,
        hip_clang_launch,
        is_pytorch_extension,
        clean_ctx,
        show_progress,
    )

    # Show what happened
    if show_progress and "ignored" not in result.status:
        print(fin_path, "->", result.hipified_path, result.status, flush=True)

    HIPIFY_FINAL_RESULT[fin_path] = result


def compute_stats(stats):
    """Print a summary of unsupported CUDA calls and replaced kernel launches.

    ``stats["unsupported_calls"]`` is an iterable of ``(cuda_call, filepath)``
    pairs and ``stats["kernel_launches"]`` a sized collection.
    """
    unsupported_calls = {
        cuda_call for (cuda_call, _filepath) in stats["unsupported_calls"]
    }

    # Print the number of unsupported calls
    print(
        f"Total number of unsupported CUDA function calls: {len(unsupported_calls):d}"
    )

    # Print the list of unsupported calls.  Sorted so that the report is
    # reproducible from run to run (set iteration order is arbitrary).
    print(", ".join(sorted(unsupported_calls)))

    # Print the number of kernel launches
    print(
        f"\nTotal number of replaced kernel launches: {len(stats['kernel_launches']):d}"
    )


def add_dim3(kernel_string, cuda_kernel):
    """Wrap the first two kernel launch parameters in ``dim3(...)``.

    ``kernel_string`` is the ``<<<...>>>`` launch configuration and
    ``cuda_kernel`` the text that contains it.  Returns ``cuda_kernel`` with
    the first two launch parameters replaced by ``dim3(<param>)``.
    """
    count = 0
    closure = 0
    kernel_string = kernel_string.replace("<<<", "").replace(">>>", "")
    arg_locs: List[Dict[str, int]] = [{} for _ in range(2)]
    arg_locs[count]["start"] = 0
    last_index = len(kernel_string) - 1
    for ind, c in enumerate(kernel_string):
        if count > 1:
            break
        # Track parenthesis depth so commas inside calls such as
        # ``dim3(a, b)`` are not mistaken for parameter separators.
        if c == "(":
            closure += 1
        elif c == ")":
            closure -= 1
        if (c == "," or ind == last_index) and closure == 0:
            # When the parameter ends at the end of the string rather than at
            # a comma, the end index must include the final character.
            arg_locs[count]["end"] = ind + (c != ",")
            count += 1
            if count < 2:
                arg_locs[count]["start"] = ind + 1

    first_start, first_end = arg_locs[0]["start"], arg_locs[0]["end"]
    second_start, second_end = arg_locs[1]["start"], arg_locs[1]["end"]

    # The "raw" first argument keeps its trailing comma so that the two raw
    # pieces concatenate back into the original text.
    first_arg_raw = kernel_string[first_start : first_end + 1]
    second_arg_raw = kernel_string[second_start:second_end]

    first_arg_clean = kernel_string[first_start:first_end].replace("\n", "").strip(" ")
    second_arg_clean = (
        kernel_string[second_start:second_end].replace("\n", "").strip(" ")
    )

    first_arg_dim3 = f"dim3({first_arg_clean})"
    second_arg_dim3 = f"dim3({second_arg_clean})"

    first_arg_raw_dim3 = first_arg_raw.replace(first_arg_clean, first_arg_dim3)
    second_arg_raw_dim3 = second_arg_raw.replace(second_arg_clean, second_arg_dim3)
    cuda_kernel = cuda_kernel.replace(
        first_arg_raw + second_arg_raw, first_arg_raw_dim3 + second_arg_raw_dim3
    )
    return cuda_kernel
RE_KERNEL_LAUNCH = re.compile(r"([ ]+)(detail?)::[ ]+\\\n[ ]+")


def processKernelLaunches(string, stats):
    """Replace the CUDA style Kernel launches with the HIP style kernel launches.

    ``string`` is the source text; every ``name<<<...>>>(`` launch found
    outside comments and string literals is rewritten to
    ``hipLaunchKernelGGL(...``.  Each rewritten launch is appended to
    ``stats["kernel_launches"]``.  Returns the rewritten text.

    Raises ``InputError`` when a ``<<<`` has no matching ``>>>``.
    """
    # Concat the namespace with the kernel names. (Find cleaner way of doing this later).
    string = RE_KERNEL_LAUNCH.sub(lambda inp: f"{inp.group(1)}{inp.group(2)}::", string)

    def grab_method_and_template(in_kernel):
        """Scan backwards from a launch to locate the kernel name and template."""
        # The positions for relevant kernel components.
        pos = {
            "kernel_launch": {"start": in_kernel["start"], "end": in_kernel["end"]},
            "kernel_name": {"start": -1, "end": -1},
            "template": {"start": -1, "end": -1},
        }

        # Count for balancing template
        count = {"<>": 0}

        # Status for whether we are parsing a certain item.
        START = 0
        AT_TEMPLATE = 1
        AFTER_TEMPLATE = 2
        AT_KERNEL_NAME = 3

        status = START

        # Parse the string character by character, walking backwards from
        # the character just before "<<<".
        for i in range(pos["kernel_launch"]["start"] - 1, -1, -1):
            char = string[i]

            # Handle Templating Arguments
            if status in (START, AT_TEMPLATE):
                if char == ">":
                    if status == START:
                        status = AT_TEMPLATE
                        pos["template"]["end"] = i
                    count["<>"] += 1

                if char == "<":
                    count["<>"] -= 1
                    if count["<>"] == 0 and (status == AT_TEMPLATE):
                        pos["template"]["start"] = i
                        status = AFTER_TEMPLATE

            # Handle Kernel Name
            if status != AT_TEMPLATE:
                if char.isalnum() or char in {"(", ")", "_", ":", "#"}:
                    if status != AT_KERNEL_NAME:
                        status = AT_KERNEL_NAME
                        pos["kernel_name"]["end"] = i

                    # Case: Kernel name starts the string.
                    if i == 0:
                        pos["kernel_name"]["start"] = 0

                        # Finished
                        return [
                            (pos["kernel_name"]),
                            (pos["template"]),
                            (pos["kernel_launch"]),
                        ]

                else:
                    # Potential ending point if we're already traversing a kernel's name.
                    if status == AT_KERNEL_NAME:
                        pos["kernel_name"]["start"] = i

                        # Finished
                        return [
                            (pos["kernel_name"]),
                            (pos["template"]),
                            (pos["kernel_launch"]),
                        ]

    def find_kernel_bounds(string):
        """Finds the starting and ending points for all kernel launches in the string."""
        kernel_end = 0
        kernel_positions = []

        # Continue until we cannot find any more kernels.
        while True:
            # Get kernel starting position (starting from the previous ending point)
            kernel_start = string.find("<<<", kernel_end)
            if kernel_start == -1:
                break

            # The miss must be detected BEFORE adding 3: ``-1 + 3`` is 2,
            # which is a valid-looking offset and would either record a
            # bogus launch or rescan the same "<<<" forever.
            closing = string.find(">>>", kernel_start)
            if closing == -1:
                raise InputError("no kernel end found")

            # Adjust end point past the >>>
            kernel_end = closing + 3

            # Add to list of traversed kernels
            kernel_positions.append(
                {
                    "start": kernel_start,
                    "end": kernel_end,
                    "group": string[kernel_start:kernel_end],
                }
            )

        return kernel_positions

    # Replace comments and string literals from the code so that find_kernel_bounds does not
    # wrongly capture kernels in comments and string literals.
    # This function replaces them with "x" to keep positions.
    def mask_comments(string):
        in_comment = ""
        prev_c = ""
        masked = []  # joined once at the end instead of quadratic ``+=``
        for c in string:
            if in_comment == "":
                # Outside comments
                if c == "/" and prev_c == "/":
                    in_comment = "//"
                elif c == "*" and prev_c == "/":
                    in_comment = "/*"
                elif c == '"' and prev_c != "\\" and prev_c != "'":
                    in_comment = '"'
            elif in_comment == "//":
                # In // xxx
                if c == "\r" or c == "\n":
                    in_comment = ""
            elif in_comment == "/*":
                # In /* xxx */
                if c == "/" and prev_c == "*":
                    in_comment = ""
            elif in_comment == '"':
                # In ""
                if c == '"' and prev_c != "\\":
                    in_comment = ""
            prev_c = c
            masked.append(c if in_comment == "" else "x")
        return "".join(masked)

    # Grab positional ranges of all kernel launches
    get_kernel_positions = list(find_kernel_bounds(mask_comments(string)))
    output_string = string

    # Replace each CUDA kernel with a HIP kernel.
    for kernel in get_kernel_positions:
        # Get kernel components
        params = grab_method_and_template(kernel)

        # Find parenthesis after kernel launch
        parenthesis = string.find("(", kernel["end"])

        # Extract cuda kernel
        cuda_kernel = string[params[0]["start"] : parenthesis + 1]
        kernel_string = string[kernel["start"] : kernel["end"]]
        end_param_index = 0 if params[1]["end"] == -1 else 1
        kernel_name_with_template = string[
            params[0]["start"] : params[end_param_index]["end"] + 1
        ]
        cuda_kernel_dim3 = add_dim3(kernel_string, cuda_kernel)
        # Keep number of kernel launch params consistent (grid dims, group dims, stream, dynamic shared size)
        num_klp = len(
            extract_arguments(
                0, kernel["group"].replace("<<<", "(").replace(">>>", ")")
            )
        )

        hip_kernel = "hipLaunchKernelGGL(" + cuda_kernel_dim3[0:-1].replace(
            ">>>", ", 0" * (4 - num_klp) + ">>>"
        ).replace("<<<", ", ").replace(">>>", ", ").replace(
            kernel_name_with_template, "(" + kernel_name_with_template + ")"
        )

        # Replace cuda kernel with hip kernel
        output_string = output_string.replace(cuda_kernel, hip_kernel)

        # Update the statistics
        stats["kernel_launches"].append(hip_kernel)

    return output_string
def find_closure_group(input_string, start, group):
    """Generalization for finding a balancing closure group

         if group = ["(", ")"], then finds the first balanced parentheses.
         if group = ["{", "}"], then finds the first balanced bracket.

    Given an input string, a starting position in the input string, and the group type,
    find_closure_group returns the positions of group[0] and group[1] as a tuple.
    ``(None, None)`` is returned when no balanced group is found.

    Example:
        >>> find_closure_group("(hi)", 0, ["(", ")"])
        (0, 3)
    """
    opener, closer = group[0], group[1]
    depth = 0
    opened = False
    p_start = -1

    for pos in range(start, len(input_string)):
        char = input_string[pos]
        if char == opener:
            if not opened:
                # First opener seen: this is where the group begins.
                opened = True
                depth = 1
                p_start = pos
            else:
                depth += 1
        elif char == closer and opened:
            # Closers seen before any opener are ignored.
            depth -= 1
            if depth == 0:
                return p_start, pos

    return None, None


def find_bracket_group(input_string, start):
    """Finds the first balanced bracket group (``{`` ... ``}``)."""
    return find_closure_group(input_string, start, group=["{", "}"])


def find_parentheses_group(input_string, start):
    """Finds the first balanced parentheses group (``(`` ... ``)``)."""
    return find_closure_group(input_string, start, group=["(", ")"])


RE_ASSERT = re.compile(r"\bassert[ ]*\(")


def replace_math_functions(input_string):
    """FIXME: Temporarily replace std:: invocations of math functions
    with non-std:: versions to prevent linker errors NOTE: This
    can lead to correctness issues when running tests, since the
    correct version of the math function (exp/expf) might not get
    called.  Plan is to remove this function once HIP supports
    std:: math function calls inside device code

    Every ``<key>(`` occurrence is replaced by ``<value>(`` for each
    entry of ``MATH_TRANSPILATIONS``; the rewritten string is returned.
    """
    output_string = input_string
    for func, replacement in MATH_TRANSPILATIONS.items():
        output_string = output_string.replace(f"{func}(", f"{replacement}(")

    return output_string
RE_SYNCTHREADS = re.compile(r":?:?\b(__syncthreads)\b(\w*\()")


def hip_header_magic(input_string):
    """If the file makes kernel builtin calls and does not include the cuda_runtime.h header,
    then automatically add an #include to match the "magic" includes provided by NVCC.

    Returns ``input_string`` unchanged when it already includes
    ``hip/hip_runtime.h`` or ``hip/hip_runtime_api.h`` or shows no sign of
    device code; otherwise returns it prefixed with
    ``#include "hip/hip_runtime.h"``.

    TODO:
        Update logic to ignore cases where the cuda_runtime.h is included by another file.
    """
    # Check if one of the following headers is already included.
    headers = ["hip/hip_runtime.h", "hip/hip_runtime_api.h"]
    if any(re.search(rf'#include ("{ext}"|<{ext}>)', input_string) for ext in headers):
        return input_string

    # Rough logic to detect if we're inside device code
    has_device_logic = (
        "hipLaunchKernelGGL" in input_string
        or "__global__" in input_string
        or "__shared__" in input_string
        or RE_SYNCTHREADS.search(input_string) is not None
    )

    # If device logic found, provide the necessary header.
    if has_device_logic:
        return '#include "hip/hip_runtime.h"\n' + input_string

    return input_string


RE_EXTERN_SHARED = re.compile(
    r"extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;"
)


def replace_extern_shared(input_string):
    """Match extern __shared__ type foo[]; syntax and use HIP_DYNAMIC_SHARED() MACRO instead.

    Example:
        "extern __shared__ char smemChar[];" => "HIP_DYNAMIC_SHARED( char, smemChar)"
        "extern __shared__ unsigned char smem[];" => "HIP_DYNAMIC_SHARED( unsigned char, smem)"
    """

    def to_macro(match):
        # group(1) is an optional token between ``extern`` and ``__shared__``;
        # it is kept in front of the type when present.
        qualifier = match.group(1) or ""
        return f"HIP_DYNAMIC_SHARED({qualifier} {match.group(2)}, {match.group(3)})"

    return RE_EXTERN_SHARED.sub(to_macro, input_string)
def get_hip_file_path(rel_filepath, is_pytorch_extension=False):
    """
    Returns the new name of the hipified file
    """
    # At the moment, some PyTorch source files are HIPified in place.  The
    # predicate is_out_of_place tells us if this is the case or not.
    assert not os.path.isabs(rel_filepath)
    if not is_pytorch_extension and not is_out_of_place(rel_filepath):
        return rel_filepath

    orig_dirpath, orig_filename = os.path.split(rel_filepath)
    stem, suffix = os.path.splitext(orig_filename)

    # The HIPified file must get a name different from the original so the
    # original is not overwritten.  The recipe:
    #
    #   - a directory component named "cuda" becomes "hip", AND
    #   - "CUDA" in the file name becomes "HIP", AND
    #   - '.cu' ALWAYS becomes '.hip', because those files contain CUDA
    #     kernels that must be processed with the hip compiler
    #   - when NOT hipifying a PyTorch extension and the directory did not
    #     change, "hip" is inserted as the direct parent folder of the file
    #   - when hipifying a PyTorch extension and neither the directory nor
    #     the filename (incl. extension) changed, "_hip" is appended to the
    #     file name
    #
    # This isn't set in stone; it may be adjusted to support other naming
    # conventions.
    if suffix == ".cu":
        suffix = ".hip"

    new_dirpath = orig_dirpath
    for old, new in (("cuda", "hip"), ("CUDA", "HIP"), ("THC", "THH")):
        new_dirpath = new_dirpath.replace(old, new)

    stem = stem.replace("cuda", "hip").replace("CUDA", "HIP")
    # Special case to handle caffe2/core/THCCachingAllocator
    if new_dirpath != "caffe2/core":
        stem = stem.replace("THC", "THH")

    dir_unchanged = new_dirpath == orig_dirpath
    if not is_pytorch_extension and dir_unchanged:
        new_dirpath = os.path.join(new_dirpath, "hip")

    if is_pytorch_extension and dir_unchanged and (stem + suffix) == orig_filename:
        stem = stem + "_hip"

    return os.path.join(new_dirpath, stem + suffix)


def is_out_of_place(rel_filepath):
    """Return False for the trees that are hipified in place, True otherwise."""
    assert not os.path.isabs(rel_filepath)
    in_place_prefixes = (
        "torch/",
        "third_party/nvfuser/",
        "tools/autograd/templates/",
    )
    return not rel_filepath.startswith(in_place_prefixes)


# Keep this synchronized with includes/ignores in build_hygon.py
def is_pytorch_file(rel_filepath):
    """Return True when the relative path lies in a tree handled with the PyTorch mappings."""
    assert not os.path.isabs(rel_filepath)
    if rel_filepath.startswith("aten/"):
        # Everything under aten/ counts, except ATen/core.
        return not rel_filepath.startswith("aten/src/ATen/core/")
    return rel_filepath.startswith(
        ("torch/", "third_party/nvfuser/", "tools/autograd/templates/")
    )


def is_cusparse_file(rel_filepath):
    """Return True for PyTorch files whose path mentions "sparse" (case-insensitive)."""
    if not is_pytorch_file(rel_filepath):
        return False
    return "sparse" in rel_filepath.lower()


def is_special_file(rel_filepath):
    """Return True for PyTorch sparse/linalg files that use the "special" mappings."""
    if not is_pytorch_file(rel_filepath):
        return False
    lowered = rel_filepath.lower()
    if "sparse" in lowered:
        return True
    if "linalg" in lowered:
        # don't use "special" mappings for this specific linalg cublas file
        return "batchlinearalgebralibblas" not in lowered
    return False


def is_caffe2_gpu_file(rel_filepath):
    """Return True for c10/cuda files and for GPU sources that are not cudnn files."""
    assert not os.path.isabs(rel_filepath)
    if rel_filepath.startswith("c10/cuda"):
        return True
    filename = os.path.basename(rel_filepath)
    ext = os.path.splitext(filename)[1]
    looks_like_gpu = "gpu" in filename or ext in [".cu", ".cuh"]
    return looks_like_gpu and ("cudnn" not in filename)
class TrieNode:
    """A Trie node whose children are represented as a dictionary of char: TrieNode.
    A special char '' represents end of word
    """

    def __init__(self):
        self.children = {}


class Trie:
    """Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
    The corresponding Regex should match much faster than a simple Regex union."""

    def __init__(self):
        """Initialize the trie with an empty root node."""
        self.root = TrieNode()

    def add(self, word):
        """Add a word to the Trie."""
        node = self.root
        for char in word:
            node = node.children.setdefault(char, TrieNode())
        node.children[""] = True  # Mark the end of the word

    def dump(self):
        """Return the root node of Trie."""
        return self.root

    def quote(self, char):
        """Escape a char for regex."""
        return re.escape(char)

    def search(self, word):
        """Search whether word is present in the Trie.
        Returns True if yes, else return False"""
        node = self.root
        for char in word:
            if char not in node.children:
                return False
            node = node.children[char]

        # A prefix of a stored word only counts when the end-of-word marker
        # is present on the final node.
        return "" in node.children

    def _pattern(self, root):
        """Convert a Trie into a regular expression pattern.

        Returns ``None`` for a node that only carries the end-of-word
        marker, i.e. a leaf that contributes nothing beyond its own char.
        """
        node = root

        if "" in node.children and len(node.children) == 1:
            return None

        alt = []  # alternatives that continue past a single char
        cc = []  # single chars that end a word; folded into a char class
        ends_word = False  # this node is itself the end of some word
        for char in sorted(node.children):
            child = node.children[char]
            if not isinstance(child, TrieNode):
                # The '' end-of-word marker maps to True, not to a node.
                ends_word = True
                continue
            sub_pattern = self._pattern(child)
            if sub_pattern is None:
                cc.append(self.quote(char))
            else:
                alt.append(self.quote(char) + sub_pattern)
        cconly = not alt

        if cc:
            alt.append(cc[0] if len(cc) == 1 else "[" + "".join(cc) + "]")

        if len(alt) == 1:
            result = alt[0]
        else:
            result = "(?:" + "|".join(alt) + ")"

        if ends_word:
            # Everything after this node is optional because a word ends here.
            if cconly:
                result += "?"
            else:
                result = f"(?:{result})?"
        return result

    def pattern(self):
        """Export the Trie to a regex pattern."""
        return self._pattern(self.root)

    def export_to_regex(self):
        """Export the Trie to a regex pattern (same result as :meth:`pattern`)."""
        return self.pattern()


CAFFE2_TRIE = Trie()
CAFFE2_MAP = {}
PYTORCH_TRIE = Trie()
PYTORCH_MAP: Dict[str, object] = {}
def preprocessor(
    output_directory: str,
    filepath: str,
    all_files: Iterable,
    header_include_dirs: Iterable,
    stats: Dict[str, List],
    hip_clang_launch: bool,
    is_pytorch_extension: bool,
    clean_ctx: GeneratedFileCleaner,
    show_progress: bool,
) -> HipifyResult:
    """Execute the CUDA -> HIP conversion on the specified file.

    Args:
        output_directory: Root under which ``filepath`` and the generated
            files live.
        filepath: File to convert; ignored unless it is in ``all_files``.
        all_files: Files selected for hipification.
        header_include_dirs: Directories (joined onto ``output_directory``)
            searched for included headers of PyTorch extensions.
        stats: Statistics dict handed to ``processKernelLaunches``.
        hip_clang_launch: When true, kernel launches are left untouched.
        is_pytorch_extension: Selects the PyTorch mapping and enables
            hipification of the extension's own included headers.
        clean_ctx: Used to create directories and open the output file.
        show_progress: Forwarded when included headers are hipified.

    Returns:
        The ``HIPIFY_FINAL_RESULT`` entry for the input file, with
        ``hipified_path``, ``status`` (one of the "[ok]", "[skipped, ...]"
        or "[ignored, ...]" strings set below) and
        ``current_state = CurrentState.DONE`` filled in.
    """
    fin_path = os.path.abspath(os.path.join(output_directory, filepath))
    hipify_result = HIPIFY_FINAL_RESULT[fin_path]

    def _done(hipified_path, status):
        # Record the outcome on the shared result object and hand it back.
        hipify_result.hipified_path = hipified_path
        hipify_result.status = status
        hipify_result.current_state = CurrentState.DONE
        return hipify_result

    if filepath not in all_files:
        return _done(None, "[ignored, not to be hipified]")

    rel_filepath = os.path.relpath(filepath, output_directory)

    with open(fin_path, encoding="utf-8") as fin:
        # A leading breadcrumb line marks a file that is already hipify output.
        if fin.readline() == HIPIFY_C_BREADCRUMB:
            return _done(None, "[ignored, input is hipified output]")
        fin.seek(0)
        output_source = fin.read()

    orig_output_source = output_source

    # get_hip_file_path needs a relative path to work correctly
    fout_path = os.path.abspath(
        os.path.join(
            output_directory, get_hip_file_path(rel_filepath, is_pytorch_extension)
        )
    )
    if not os.path.exists(os.path.dirname(fout_path)):
        clean_ctx.makedirs(os.path.dirname(fout_path))

    # unsupported_calls statistics reporting is broken atm
    def pt_repl(m):
        return PYTORCH_MAP[m.group(0)]

    def pt_special_repl(m):
        # checks SPECIAL map first, and if a miss occurs, falls back to pytorch mappings
        return PYTORCH_SPECIAL_MAP.get(m.group(0), pt_repl(m))

    if is_pytorch_extension:
        output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source)
    elif is_special_file(rel_filepath):
        output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_special_repl, output_source)
    elif is_pytorch_file(rel_filepath):
        output_source = RE_PYTORCH_PREPROCESSOR.sub(pt_repl, output_source)
    else:

        def c2_repl(m):
            return CAFFE2_MAP[m.group(0)]

        output_source = RE_CAFFE2_PREPROCESSOR.sub(c2_repl, output_source)

    # Header rewrites
    def mk_repl(templ, include_current_dir=True):
        def repl(m):
            f = m.group(1)
            filename = os.path.basename(f)
            if f.startswith(
                (
                    "ATen/cuda",
                    "ATen/native/cuda",
                    "ATen/native/nested/cuda",
                    "ATen/native/quantized/cuda",
                    "ATen/native/sparse/cuda",
                    "ATen/native/transformers/cuda",
                    "THC/",
                )
            ) or (f.startswith("THC") and not f.startswith("THCP")):
                return templ.format(get_hip_file_path(m.group(1), is_pytorch_extension))
            # if filename is one of the files being hipified for this extension
            if is_pytorch_extension and any(s.endswith(filename) for s in all_files):
                header_dir = None
                header_filepath = None
                # If include_current_dir True, look first in same dir as the including source file
                if include_current_dir:
                    header_dir_to_check = os.path.dirname(fin_path)
                    header_path_to_check = os.path.abspath(
                        os.path.join(header_dir_to_check, f)
                    )
                    if os.path.exists(header_path_to_check):
                        header_dir = header_dir_to_check
                        header_filepath = header_path_to_check
                # If not found, look in include dirs one by one and first match wins
                if header_filepath is None:
                    for header_include_dir in header_include_dirs:
                        header_dir_to_check = os.path.join(
                            output_directory, header_include_dir
                        )
                        header_path_to_check = os.path.abspath(
                            os.path.join(header_dir_to_check, f)
                        )
                        if os.path.exists(header_path_to_check):
                            header_dir = header_dir_to_check
                            header_filepath = header_path_to_check
                            # Stop here: without the break a later include
                            # dir silently overrode the first match.
                            break
                # If header file not found, keep as is
                if header_filepath is None:
                    return m.group(0)
                # Hipify header file first if needed
                if header_filepath not in HIPIFY_FINAL_RESULT:
                    preprocess_file_and_save_result(
                        output_directory,
                        header_filepath,
                        all_files,
                        header_include_dirs,
                        stats,
                        hip_clang_launch,
                        is_pytorch_extension,
                        clean_ctx,
                        show_progress,
                    )
                else:
                    header_result = HIPIFY_FINAL_RESULT[header_filepath]
                    if header_result.current_state == CurrentState.INITIALIZED:
                        # The header is registered but not finished yet, so
                        # compute where its output will be written and point
                        # the include there.
                        # get_hip_file_path needs a relative path to work correctly
                        header_rel_path = os.path.relpath(
                            header_filepath, output_directory
                        )
                        header_fout_path = os.path.abspath(
                            os.path.join(
                                output_directory,
                                get_hip_file_path(
                                    header_rel_path, is_pytorch_extension
                                ),
                            )
                        )
                        header_result.hipified_path = header_fout_path
                        HIPIFY_FINAL_RESULT[header_filepath] = header_result
                        return templ.format(
                            os.path.relpath(header_fout_path, header_dir)
                        )
                hipified_header_filepath = HIPIFY_FINAL_RESULT[
                    header_filepath
                ].hipified_path
                return templ.format(
                    os.path.relpath(
                        (
                            hipified_header_filepath
                            if hipified_header_filepath is not None
                            else header_filepath
                        ),
                        header_dir,
                    )
                )

            return m.group(0)

        return repl

    output_source = RE_QUOTE_HEADER.sub(mk_repl('#include "{0}"', True), output_source)
    output_source = RE_ANGLE_HEADER.sub(mk_repl("#include <{0}>", False), output_source)
    output_source = RE_THC_GENERIC_FILE.sub(
        mk_repl('#define THC_GENERIC_FILE "{0}"'), output_source
    )

    # CMakeLists.txt rewrites
    if filepath.endswith("CMakeLists.txt"):
        output_source = output_source.replace("CUDA", "HIP")
        output_source = output_source.replace("THC", "THH")
        output_source = RE_CU_SUFFIX.sub(".hip", output_source)

    # Perform Kernel Launch Replacements
    if not hip_clang_launch:
        output_source = processKernelLaunches(output_source, stats)

    # Replace std:: with non-std:: versions
    if filepath.endswith((".cu", ".cuh")) and "PowKernel" not in filepath:
        output_source = replace_math_functions(output_source)

    # Include header if device code is contained.
    output_source = hip_header_magic(output_source)

    # NOTE: replacing `extern __shared__` is no longer needed after the
    # transition from hcc to hipclang, so that pass is not run.

    # Don't write out identical hipified files for extensions if dirpath has not changed
    if (
        is_pytorch_extension
        and orig_output_source == output_source
        and os.path.dirname(fin_path) == os.path.dirname(fout_path)
    ):
        return _done(fin_path, "[skipped, no changes]")

    # Add hipify breadcrumb for C-style files to avoid re-hipification
    if fin_path != fout_path and match_extensions(
        fin_path, (".cu", ".cuh", ".c", ".cc", ".cpp", ".h", ".hpp")
    ):
        output_source = HIPIFY_C_BREADCRUMB + output_source

    do_write = True
    if os.path.exists(fout_path):
        with open(fout_path, encoding="utf-8") as fout_old:
            do_write = fout_old.read() != output_source
    if not do_write:
        return _done(fout_path, "[skipped, already hipified]")

    try:
        with clean_ctx.open(fout_path, "w", encoding="utf-8") as fout:
            fout.write(output_source)
        return _done(fout_path, "[ok]")
    except PermissionError as e:
        print(
            f'{bcolors.WARNING}Failed to save {fout_path} with "{e.strerror}", leaving {fin_path} unchanged.{bcolors.ENDC}',
            file=sys.stderr,
        )
        return _done(fin_path, "[skipped, no permissions]")
def extract_arguments(start, string):
    """Locate the arguments of the call whose opening parenthesis is at ``start``.

    Scans forward from ``start`` tracking ``(``/``)`` and ``<``/``>`` nesting
    (a ``>`` preceded by ``-`` is not counted as a closer) and splits on
    commas seen at the outermost parenthesis level.

    Example:
        string (input): '(blocks, threads, 0, THCState_getCurrentStream(state))'
        arguments (output):
            '[{'start': 1, 'end': 7},
            {'start': 8, 'end': 16},
            {'start': 17, 'end': 19},
            {'start': 20, 'end': 53}]'

    Returns:
        A list of ``{"start": i, "end": j}`` dicts, one per argument, where
        ``string[i:j]`` is the argument text.
    """
    spans = []
    depth = {"<": 0, "(": 0}
    arg_begin = start + 1

    for pos in range(start, len(string)):
        ch = string[pos]
        if ch == "(":
            depth["("] += 1
        elif ch == ")":
            depth["("] -= 1
        elif ch == "<":
            depth["<"] += 1
        elif ch == ">" and string[pos - 1] != "-" and depth["<"] > 0:
            depth["<"] -= 1

        # Every bracket is closed again: this position ends the last argument.
        if depth["("] == 0 and depth["<"] == 0:
            spans.append({"start": arg_begin, "end": pos})
            break

        # A comma at the outermost call level separates two arguments.
        if depth["("] == 1 and depth["<"] == 0 and ch == ",":
            spans.append({"start": arg_begin, "end": pos})
            arg_begin = pos + 1

    return spans
def hipify(
    project_directory: str,
    show_detailed: bool = False,
    extensions: Iterable = (".cu", ".cuh", ".c", ".cc", ".cpp", ".h", ".in", ".hpp"),
    header_extensions: Iterable = (".cuh", ".h", ".hpp"),
    output_directory: str = "",
    header_include_dirs: Iterable = (),
    includes: Iterable = ("*",),
    extra_files: Iterable = (),
    out_of_place_only: bool = False,
    ignores: Iterable = (),
    show_progress: bool = True,
    hip_clang_launch: bool = False,
    is_pytorch_extension: bool = False,
    hipify_extra_files_only: bool = False,
    clean_ctx: Optional[GeneratedFileCleaner] = None,
) -> HipifyFinalResult:
    """Hipify the matching files of a project and return the per-file results.

    Args:
        project_directory: Source tree; the current working directory when "".
        show_detailed: Print the statistics summary at the end.
        extensions: File extensions selected for conversion.
        header_extensions: Extensions picked up from ``header_include_dirs``.
        output_directory: Destination tree; defaults to
            ``<project_directory>_hygon`` and is created by copying the
            project when it does not exist yet.
        header_include_dirs: Extra directories whose headers are hipified.
        includes: Glob patterns of files to include.
        extra_files: Additional files to convert (relative paths are joined
            onto ``output_directory``).
        out_of_place_only: Forwarded to ``matched_files_iter``.
        ignores: Glob patterns of files to skip.
        show_progress: Forwarded to the per-file preprocessing.
        hip_clang_launch: Forwarded to the per-file preprocessing.
        is_pytorch_extension: Forwarded to file matching and preprocessing.
        hipify_extra_files_only: Convert only ``extra_files``.
        clean_ctx: File cleaner; a keep-intermediates one is created if None.

    Returns:
        ``HIPIFY_FINAL_RESULT``, the module-level result mapping.

    Exits the process with status 1 when ``project_directory`` does not exist.
    """
    if project_directory == "":
        project_directory = os.getcwd()

    # Verify the project directory exists.
    if not os.path.exists(project_directory):
        print("The project folder specified does not exist.")
        sys.exit(1)

    # If no output directory, provide a default one.
    if not output_directory:
        # str.rstrip returns a new string; the result was discarded before,
        # so "proj/" produced "proj/_hygon" and broke the prefix rewriting
        # below. Keep the original value if stripping would empty it ("/").
        project_directory = project_directory.rstrip("/") or project_directory
        output_directory = project_directory + "_hygon"

    if project_directory != output_directory:
        includes = [
            include.replace(project_directory, output_directory) for include in includes
        ]
        ignores = [
            ignore.replace(project_directory, output_directory) for ignore in ignores
        ]

    # Copy from project directory to output directory if not done already.
    if not os.path.exists(output_directory):
        shutil.copytree(project_directory, output_directory)

    all_files = list(
        matched_files_iter(
            output_directory,
            includes=includes,
            ignores=ignores,
            extensions=extensions,
            out_of_place_only=out_of_place_only,
            is_pytorch_extension=is_pytorch_extension,
        )
    )
    all_files_set = set(all_files)
    for f in extra_files:
        if not os.path.isabs(f):
            f = os.path.join(output_directory, f)
        if f not in all_files_set:
            all_files.append(f)

    # List all files in header_include_paths to ensure they are hipified
    from pathlib import Path

    for header_include_dir in header_include_dirs:
        if os.path.isabs(header_include_dir):
            header_include_dir_path = Path(header_include_dir)
        else:
            header_include_dir_path = Path(
                os.path.join(output_directory, header_include_dir)
            )
        for path in header_include_dir_path.rglob("*"):
            if (
                path.is_file()
                and _fnmatch(str(path), includes)
                and (not _fnmatch(str(path), ignores))
                and match_extensions(path.name, header_extensions)
            ):
                all_files.append(str(path))

    if clean_ctx is None:
        clean_ctx = GeneratedFileCleaner(keep_intermediates=True)

    # Preprocessing statistics.
    stats: Dict[str, List] = {"unsupported_calls": [], "kernel_launches": []}

    for filepath in all_files if not hipify_extra_files_only else extra_files:
        preprocess_file_and_save_result(
            output_directory,
            filepath,
            all_files,
            header_include_dirs,
            stats,
            hip_clang_launch,
            is_pytorch_extension,
            clean_ctx,
            show_progress,
        )

    print(
        bcolors.OKGREEN
        + "Successfully preprocessed all matching files."
        + bcolors.ENDC,
        file=sys.stderr,
    )

    # Show detailed summary
    if show_detailed:
        compute_stats(stats)

    return HIPIFY_FINAL_RESULT
def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
    """Return whether ``torch_version`` parses to a version >= ``target``."""
    torch_version = version.parse(torch_version)
    return torch_version >= version.parse(target)


# Ops whose schema is built by ``generate_schema`` instead of being inferred
# by torch.
MANUAL_SCHEMA_OPS = [
    "register_graph_buffers",
    # "module_moe_ck2stages",
    # "mha_fwd",
    # "fmha_v3_fwd",
    # "mha_varlen_fwd",
    # "mha_bwd",
    # "fmha_v3_bwd",
    # "mha_varlen_bwd",
    # "fmha_v3_varlen_bwd",
    # "fmha_v3_varlen_fwd",
    # "mha_batch_prefill",
    "hipb_findallsols",
    "rocb_findallsols",
    "_ActivationType",
    "_QuantType",
    "init_custom_ar",
    # "greedy_sample",
    # "random_sample",
    # "mixed_sample",
    # "exponential",
]

# Ops that ``torch_compile_guard`` returns as a plain wrapper without
# registering them as custom ops.
NONE_WRAPPED_OP = [
    "hipb_create_extension",
    # "hipb_destroy_extension",
    # "getHipblasltKernelName",
    # "rocb_create_extension",
    # "rocb_destroy_extension",
    "get_meta_buffer_ipc_handle",
    "get_graph_buffer_ipc_meta",
    "asm_moe_get_solutions",
    "ck_moe_get_solutions",
    "_ActivationType",
    "_QuantType",
    "get_moe_asm_solution",
    # "allocate_meta_buffer",
    # "dispose",
    # "meta_size",
    # "get_padded_m",
    # "compile_mha_fwd",
    # "compile_mha_bwd",
    "init_custom_qr",
    # "qr_max_size",
    # "qr_destroy",
    # "qr_open_handles",
    # "qr_get_handle",
]


def generate_schema(func, mutates_args: Union[list[str], str] = "unknown") -> str:
    """Build an operator schema string from the annotations of ``func``.

    Args:
        func: Callable whose signature is inspected.
        mutates_args: Names of the tensor arguments to mark as mutated
            (``Tensor(aN!)``); the default ``"unknown"`` marks all of them.

    Returns:
        A string of the form ``"(<params>) -> <return type>"``. A parameter
        with an unrecognised annotation is emitted as ``"* "`` without name
        or default; an unrecognised return annotation becomes ``"Any"``.
    """
    import inspect

    import torch

    sig = inspect.signature(func)
    parameters = []
    for idx, (name, param) in enumerate(sig.parameters.items()):
        param_type = param.annotation
        is_mutates = mutates_args == "unknown" or name in mutates_args
        # Alias annotation appended to tensor types that may be written to.
        alias = f"(a{idx}!)" if is_mutates else ""
        known = True

        if param_type is torch.Tensor:
            type_str = f"Tensor{alias}"
        elif param_type == Optional[torch.Tensor]:
            type_str = f"Tensor{alias}?"
        elif get_origin(param_type) is Union and torch.Tensor in get_args(param_type):
            type_str = f"Tensor{alias}?"
        elif param_type in (torch.SymInt, int):
            type_str = "SymInt"
        elif param_type in (float, bool, str):
            type_str = param_type.__name__
        elif param_type == Optional[torch.Generator]:
            type_str = "Generator?"
        elif (
            get_origin(param_type) in (list, List)
            and get_args(param_type)[0] is torch.Tensor
        ):
            type_str = f"Tensor{alias}[]"
        elif get_origin(param_type) in (list, List) and get_args(param_type)[0] is int:
            type_str = "int[]"
        elif param_type == Optional[torch.dtype]:
            type_str = "ScalarType?"
        else:
            type_str = "*"
            known = False

        if known:
            param_str = f"{type_str} {name}"
            # Identity test: the default may be an object with its own ``!=``.
            if param.default is not inspect.Parameter.empty:
                if param.default is None:
                    param_str += "=None"
                elif isinstance(param.default, str):
                    # String defaults must be quoted to form a valid literal.
                    param_str += f'="{param.default}"'
                else:
                    param_str += f"={param.default}"
        else:
            param_str = f"{type_str} "

        parameters.append(param_str)

    return_annotation = sig.return_annotation
    scalar_names = {int: "int", float: "float", bool: "bool"}
    if return_annotation is type(None) or return_annotation is None:
        return_type = "()"
    elif return_annotation is torch.Tensor:
        return_type = "Tensor"
    elif return_annotation in (int, float, bool):
        return_type = scalar_names[return_annotation]
    elif (
        get_origin(return_annotation) is list and get_args(return_annotation)[0] is int
    ):
        return_type = "int[]"
    elif (
        get_origin(return_annotation) is list
        and get_args(return_annotation)[0] is torch.Tensor
    ):
        return_type = "Tensor[]"
    elif get_origin(return_annotation) is tuple:
        # Only Tensor/int/float/bool members are emitted; others are dropped.
        type_strings = []
        for arg in get_args(return_annotation):
            if arg is torch.Tensor:
                type_strings.append("Tensor")
            elif arg in (int, float, bool):
                type_strings.append(scalar_names[arg])
        return_type = f"({', '.join(type_strings)})"
    else:
        return_type = "Any"

    return f"({', '.join(parameters)}) -> {return_type}"
def torch_compile_guard(
    mutates_args: Union[list[str], str] = "unknown",
    device: str = "cpu",
    calling_func_: Optional[Callable[..., Any]] = None,
    gen_fake: Optional[Callable[..., Any]] = None,
):
    """Decorator factory that exposes a function as a ``torch.ops.aiter`` op.

    The decorated function is registered in the ``aiter`` library fragment
    under its own name for the CUDA and CPU dispatch keys, together with a
    fake implementation. Calls then go through ``torch.ops.aiter.<name>``.

    Two schema adjustments are made so that every op takes and returns a
    tensor:
      * if the first parameter is not annotated ``torch.Tensor``, a leading
        ``Tensor dummy`` argument is added and filled with
        ``torch.empty(1, device=device)`` on every call;
      * if the return annotation is ``int``, ``bool`` or ``float``, the op
        returns ``(Tensor, value)`` and the wrapper unwraps ``value``.

    Args:
        mutates_args: Forwarded to schema inference; ``"unknown"`` treats
            every tensor argument as mutated.
        device: Device of the dummy tensors created by the wrappers.
        calling_func_: Function whose name and signature define the op;
            defaults to the decorated function.
        gen_fake: Optional fake implementation; when omitted the fake
            implementation calls the real function.

    Returns:
        The decorator. It returns a plain pass-through wrapper when torch
        cannot be imported or when the op is listed in ``NONE_WRAPPED_OP``.
    """

    def decorator(func):
        # In core.py, we calling wrapper, but actually we need use aiter.op func
        calling_func = calling_func_ if calling_func_ is not None else func

        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)

        try:
            import torch
            from torch.library import Library  # noqa: F401 - availability probe
            import inspect
        except ImportError:
            return wrapper

        if calling_func.__name__ in NONE_WRAPPED_OP:
            return wrapper

        def wrapper_register(calling_func):
            # Imported again locally: ``import torch._custom_op.impl`` below
            # binds ``torch`` as a local name of this function.
            import inspect

            import torch
            import torch.library
            from torch.library import Library

            global aiter_lib
            aiter_lib = Library("aiter", "FRAGMENT") if aiter_lib is None else aiter_lib
            if calling_func.__name__ in MANUAL_SCHEMA_OPS:
                return generate_schema(calling_func)

            if hasattr(torch.library, "infer_schema"):
                return torch.library.infer_schema(
                    calling_func, mutates_args=mutates_args
                )

            # for pytorch 2.4
            import torch._custom_op.impl

            # torch 2.4 not support mutates "unknown" for inplace all param
            if mutates_args == "unknown":
                sig = inspect.signature(calling_func)
                mutates_args_custom = [
                    param_name
                    for param_name, param in sig.parameters.items()
                    if param.annotation == torch.Tensor
                ]
            else:
                # An explicit list was previously never bound here, which
                # raised UnboundLocalError on this code path.
                mutates_args_custom = mutates_args

            return torch._custom_op.impl.infer_schema(
                calling_func, mutates_args_custom
            )

        schema = wrapper_register(calling_func)

        sig = inspect.signature(calling_func)
        parameters = list(sig.parameters.values())
        input_is_tensor = bool(parameters) and (
            parameters[0].annotation is not inspect.Parameter.empty
            and parameters[0].annotation is torch.Tensor
        )

        input_part, output_part = schema.split("->", 1)
        if input_is_tensor:
            new_input = input_part
        elif not sig.parameters:
            new_input = "(Tensor dummy)"
        else:
            new_input = "(Tensor dummy, " + input_part[1:]

        return_non_tensor = False
        if sig.return_annotation in [int, bool, float]:
            output_part = "(Tensor, " + output_part + ")"
            return_non_tensor = True

        schema = f"{new_input} -> {output_part}".strip()

        loadName = calling_func.__name__

        def wrapper_custom(*args, **kwargs):
            op = getattr(torch.ops.aiter, f"{loadName}")
            result = (
                op(*args, **kwargs)
                if input_is_tensor
                else op(torch.empty(1, device=device), *args, **kwargs)
            )
            return result[1] if return_non_tensor else result

        # Already registered (e.g. decorated twice): just dispatch to it.
        if hasattr(torch.ops.aiter, loadName):
            return wrapper_custom

        def abstract_impl(*args, **kwargs):
            fake = gen_fake if gen_fake is not None else calling_func
            if return_non_tensor:
                return torch.empty(1, device=device), fake(*args, **kwargs)
            return fake(*args, **kwargs)

        def outer_wrapper(*args, **kwargs):
            return (
                wrapper(*args, **kwargs)
                if not return_non_tensor
                else (torch.empty(1, device=device), wrapper(*args, **kwargs))
            )

        # Variants for ops that received the extra leading dummy tensor.
        def abstract_impl_dummy(dummy, *args, **kwargs):
            return abstract_impl(*args, **kwargs)

        def outer_wrapper_dummy(dummy, *args, **kwargs):
            return outer_wrapper(*args, **kwargs)

        custom_func = outer_wrapper
        fake_func = abstract_impl
        if not input_is_tensor:
            custom_func = outer_wrapper_dummy
            fake_func = abstract_impl_dummy

        if not hasattr(torch.ops.aiter, calling_func.__name__):
            if is_torch_equal_or_newer("2.8.0"):
                tags = ()
            else:
                tags = (torch.Tag.needs_fixed_stride_order,)
            op_schema = f"aiter::{loadName}" + schema
            aiter_lib.define(op_schema, tags=tags)
            aiter_lib.impl(f"aiter::{loadName}", custom_func, dispatch_key="CUDA")
            aiter_lib.impl(f"aiter::{loadName}", custom_func, dispatch_key="CPU")
            aiter_lib._register_fake(f"{loadName}", fake_func)

        return wrapper_custom

    return decorator
@functools.lru_cache()
def get_meta_param(num_kv_splits, device, bs, nhead):
    """Pick the KV split count and the ``mgc`` constant for the MLA decode kernels.

    Args:
        num_kv_splits: Requested number of KV splits, or ``None`` to derive
            it as ``cu_num // bs`` clamped to the range 1..16.
        device: Not read here; it only takes part in the cache key.
        bs: Batch size used when deriving the split count.
        nhead: Number of heads; must be 16 or 128.

    Returns:
        ``(num_kv_splits, mgc)`` where ``mgc`` is 64 for 16 heads and 16 for
        128 heads.
    """
    mgc_by_nhead = {16: 64, 128: 16}

    if num_kv_splits is None:
        num_kv_splits = min(16, max(1, get_cu_num() // bs))

    assert nhead in mgc_by_nhead, f"{nhead=} not supported"
    return num_kv_splits, mgc_by_nhead[nhead]
def mla_prefill_fwd(
    q,  # [num_seqs, num_heads, head_size]
    kv_buffer,  # [num_page, page_size, num_kv_heads, kv_lora_rank + qk_rope_head_dim]
    o,  # [num_seqs, num_heads, v_head_dim]
    qo_indptr,
    kv_indptr,
    kv_indices,
    kv_last_page_lens,
    max_seqlen_q,
    sm_scale=None,  # 1.0 / (qk_head_dim**0.5)
    logit_cap=0.0,
    num_kv_splits=None,  # for experts only!!!
):
    """Run the MLA prefill kernel with the output tensor ``o`` as its buffer.

    Args:
        q, kv_buffer, o: Query, paged KV cache and output tensors (shapes as
            noted on the parameters).
        qo_indptr, kv_indptr, kv_indices, kv_last_page_lens, max_seqlen_q:
            Passed through unchanged to ``aiter.mla_prefill_asm_fwd``.
        sm_scale: Softmax scale; defaults to ``1 / sqrt(qk_head_dim)`` with
            ``qk_head_dim`` taken from the last dimension of ``kv_buffer``.
        logit_cap: Must be <= 0; capping is not supported.
        num_kv_splits: Ignored; a single split is always used.

    Returns:
        ``(o viewed as [bs, nhead, v_head_dim], attn_lse)`` where
        ``attn_lse`` is a new fp32 tensor of shape ``[bs, 1, nhead, 1]``.
    """
    device = q.device
    assert logit_cap <= 0, f"{logit_cap=} is not support yet"

    # Unpack the shapes first: the default sm_scale depends on qk_head_dim,
    # which used to be read before it was assigned.
    num_page, page_size, nhead_kv, qk_head_dim = kv_buffer.shape
    bs, nhead, v_head_dim = o.shape
    if sm_scale is None:
        sm_scale = 1.0 / (qk_head_dim**0.5)

    num_kv_splits = 1

    # ``logits`` is a view of ``o``, so the kernel is handed the caller's
    # output storage instead of a separate fp32 scratch buffer.
    logits = o.view(bs, num_kv_splits, nhead, v_head_dim)
    attn_lse = torch.empty(
        (bs, num_kv_splits, nhead, 1), dtype=dtypes.fp32, device=device
    )

    aiter.mla_prefill_asm_fwd(
        q,
        kv_buffer,
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_last_page_lens,
        max_seqlen_q,
        sm_scale,
        logits,
        attn_lse,
    )

    return o.view(bs, nhead, v_head_dim), attn_lse
def _try_get_moe_c_config(
    quant_type: str,
    m: int,
    e: int,
    n: int,
    block_size: int,
) -> Optional[Dict[str, Any]]:
    """Look up the moe_c config closest to ``m`` tokens.

    Handles W4A16, W8A8 and W4A8 (the latter also passes ``block_size`` as
    ``block_n``/``block_k``). Returns ``None`` for any other quant type, when
    no config table is available, or when the lookup raises; a failure is
    logged at debug level.
    """
    try:
        # Per-quant-type arguments; everything else is shared below.
        if quant_type == MoeQuantType.W4A16:
            specific = {"dtype": "int4_w4a16"}
        elif quant_type == MoeQuantType.W8A8:
            specific = {"dtype": "int8_w8a8"}
        elif quant_type == MoeQuantType.W4A8:
            specific = {
                "dtype": "int8_w4a8",
                "block_n": block_size,
                "block_k": block_size,
            }
        else:
            return None

        from .fused_moe_c import get_moe_configs_marlin

        table = get_moe_configs_marlin(
            E=e,
            N=n,
            is_bottom=False,
            use_moe_wna16_cuda=True,
            **specific,
        )
        return None if table is None else _pick_closest_config(table, m)
    except Exception as exc:
        logger.debug("moe_c config lookup failed for %s: %s", quant_type, exc)
        return None
def _try_get_triton_config(
    quant_type: str,
    m: int,
    e: int,
    n: int,
    block_size: int,
) -> Optional[Dict[str, Any]]:
    """Look up the Triton config closest to ``m`` tokens.

    W16A16 needs no tuned config and yields an empty dict. W4A16 and W8A8
    are looked up in the Triton config tables. Any other quant type, a
    missing table, or any failure (including a failed import of the Triton
    helpers) yields ``None``; failures are logged at debug level.
    """
    try:
        from .ops.triton.utils.moe_config_utils import get_moe_configs as triton_get_moe_configs

        if quant_type == MoeQuantType.W16A16:
            # Non-quantized; no tuned config lookup needed
            return {}

        dtype_by_quant = {
            MoeQuantType.W4A16: "int4_w4a16",
            MoeQuantType.W8A8: "int8_w8a8",
        }
        dtype_name = dtype_by_quant.get(quant_type)
        if dtype_name is None:
            return None

        table = triton_get_moe_configs(
            E=e,
            N=n,
            dtype=dtype_name,
            block_n=0,
            block_k=block_size or 0,
            is_bottom=False,
        )
        if table is not None:
            return _pick_closest_config(table, m)
        return None
    except Exception as exc:
        logger.debug("Triton config lookup failed for %s: %s", quant_type, exc)
        return None
Optional[List[int]], +) -> Optional[Dict[str, Any]]: + try: + if quant_type != MoeQuantType.W8A8: + return None + + from .fused_moe_ck import get_moe_ck_solution_id, MoeQuantType as CkMoeQuantType + from .jit.utils.chip_info import get_gfx + + arch = get_gfx() + q_size_n = block_shape[0] if block_shape is not None else 0 + q_size_k = block_shape[1] if block_shape is not None else 0 + solution_id = get_moe_ck_solution_id( + arch, + CkMoeQuantType.INT8_W8A8, + m, + n, + k, + e, + top_k, + q_size_n, + q_size_k, + ) + return {"solution_id": solution_id} + except Exception as exc: + logger.debug("CK config lookup failed for %s: %s", quant_type, exc) + return None + + +def get_aiter_moe_config( + M: int, # Number of tokens (input sequence length) + E: int, # Number of experts + N1: int, # GEMM1 output dimension, typically equal to (moe_intermediate_size / TP * 2) + N2: int, # GEMM2 output dimension, typically equal to hidden_size + K: int, # GEMM1 input dimension, typically equal to hidden_size; for GEMM2, K typically equal to (moe_intermediate_size / TP) + top_k: int, + block_size: int, + dtype: torch.dtype, + quant_type: str, +) -> Tuple[bool, AiterMoeConfig]: + """Get the best backend config for a MOE problem. 
+ + Currently supported quant types: + - ``MoeQuantType.W16A16`` (non-quantized) + - ``MoeQuantType.W4A16`` + - ``MoeQuantType.W8A8`` + - ``MoeQuantType.W4A8`` + + Backend priority: + - ``w16a16``: asm > triton + - ``w4a16``: moe_c > asm > triton + - ``w8a8``: asm > moe_c > triton > ck + - ``w4a8``: moe_c + """ + n = N1 / 2 + block_shape = [0, block_size] if block_size else None + + if quant_type == MoeQuantType.W4A16: + if dtype == torch.float16: + candidates = [ + (MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)), + ] + elif dtype == torch.bfloat16: + candidates = [ + (MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k, block_size)), + (MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)), + ] + else: + raise ValueError(f"Unsupported dtype: {dtype}") + elif quant_type == MoeQuantType.W8A8: + if block_size == 0: # Channel wise choose MOE_C + candidates = [ + (MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)), + (MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)), + # (MoeSolutionType.CK, lambda: _try_get_ck_config(quant_type, M, E, n, K, top_k, block_shape)), + ] + else: # Block wise choose ASM + candidates = [ + (MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k, block_size)), + ] + + elif quant_type == MoeQuantType.W4A8: + candidates = [ + (MoeSolutionType.MOE_C, lambda: _try_get_moe_c_config(quant_type, M, E, n, block_size)), + # (MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k)), + ] + elif quant_type == MoeQuantType.W16A16: + candidates = [ + (MoeSolutionType.ASM, lambda: _try_get_asm_config(quant_type, M, E, n, K, top_k, None)), + (MoeSolutionType.TRITON, lambda: _try_get_triton_config(quant_type, M, E, n, block_size)), + ] + else: + raise ValueError(f"Unsupported quant_type: {quant_type}") + + for solution_type, get_config 
in candidates: + config = get_config() + if config is not None: + return True, AiterMoeConfig( + quant_type=quant_type, + solution_type=solution_type, + config=config, + ) + + return False, AiterMoeConfig(quant_type=quant_type) + + +def aiter_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + moe_config: AiterMoeConfig, + inplace: Optional[bool] = False, + activation: str = "silu", + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + routed_scaling_factor: Optional[float] = 1.0, +) -> torch.Tensor: + """Execute MOE using the backend and quant type described by *moe_config*.""" + if moe_config.solution_type is None or moe_config.quant_type is None: + raise ValueError( + "moe_config has no valid solution_type/quant_type. " + "Call get_aiter_moe_config first and check the status." 
+ ) + + use_int4_w4a16 = moe_config.quant_type == MoeQuantType.W4A16 + use_int8_w8a8 = moe_config.quant_type == MoeQuantType.W8A8 + use_int8_w4a8 = moe_config.quant_type == MoeQuantType.W4A8 + + if moe_config.solution_type == MoeSolutionType.MOE_C: + from .fused_moe_c import moe_c_fused_experts + + return moe_c_fused_experts( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + inplace=inplace, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a8=use_int8_w8a8, + use_int8_w4a8=use_int8_w4a8, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + routed_scaling_factor=routed_scaling_factor + ) + + if moe_config.solution_type == MoeSolutionType.ASM: + from .fused_moe_asm_wna16 import fused_experts_asm_impl + + cfg = moe_config.config + solution_id = f"{cfg['SOL_ID1']}+{cfg['SOL_ID2']}" + return fused_experts_asm_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + dtype=hidden_states.dtype, + inplace=inplace, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a8=use_int8_w8a8, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + solution_id=solution_id, + routed_scaling_factor=routed_scaling_factor + ) + + if moe_config.solution_type == MoeSolutionType.TRITON: + from .ops.triton.fused_moe import fused_experts_impl + + # W8A8 channel-wise (block_shape=None) requires per_channel_quant=True + per_channel_quant = use_int8_w8a8 and block_shape is None + + return fused_experts_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + odtype=hidden_states.dtype, + inplace=inplace, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a8=use_int8_w8a8, + activation=activation, + 
per_channel_quant=per_channel_quant, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + routed_scaling_factor=routed_scaling_factor + ) + + if moe_config.solution_type == MoeSolutionType.CK: + from .fused_moe_ck import run_fused_experts_ck_impl + + solution_id = moe_config.config["solution_id"] + return run_fused_experts_ck_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + odtype=hidden_states.dtype, + inplace=inplace, + use_int8_w8a8=use_int8_w8a8, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + routed_scaling_factor=routed_scaling_factor, + solution_id=solution_id, + ) + + raise ValueError(f"Unknown solution_type: {moe_config.solution_type}") + + +def get_aiter_moe_config_w4a16( + M: int, + E: int, + N1: int, + N2: int, + K: int, + top_k: int, + block_size: int, + dtype: torch.dtype, +) -> Tuple[bool, AiterMoeConfig]: + """Backward-compatible wrapper for w4a16 config lookup.""" + return get_aiter_moe_config(M, E, N1, N2, K, top_k, block_size, dtype, MoeQuantType.W4A16) + + +def aiter_moe_w4a16( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + moe_config: AiterMoeConfig, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + activation: str = "silu", +) -> torch.Tensor: + """Backward-compatible wrapper for w4a16 execution.""" 
+ return aiter_moe( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + moe_config=moe_config, + activation=activation, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) diff --git a/aiter/moe_c_configs/E=128,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=128,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=128,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { 
+ "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=128,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=128,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=128,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + 
"BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git 
a/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json 
b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json new file mode 100644 index 
0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ 
b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=128,N=768,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + 
"11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + 
"MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + 
"BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=1024,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 
16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json b/aiter/moe_c_configs/E=256,N=128,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..83a038e8e8607dd4363208f3ce57d85dcc6f7e43 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + 
"nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 28, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + } +} \ No newline at end of file diff --git 
a/aiter/moe_c_configs/E=256,N=128,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json b/aiter/moe_c_configs/E=256,N=128,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..cc8ea14789de27f64e5c9c9680184fc53d17b837 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ 
b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": 
{ + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 
+ }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=fp8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=fp8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=fp8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 
16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=fp8_w8a8.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=fp8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { 
+ "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + 
}, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 
16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + 
"BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + 
"BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git 
a/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json 
b/aiter/moe_c_configs/E=256,N=2048,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..83a038e8e8607dd4363208f3ce57d85dcc6f7e43 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
128, + "kloops": 2, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 28, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json b/aiter/moe_c_configs/E=256,N=2048,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..cc8ea14789de27f64e5c9c9680184fc53d17b837 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "128": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + 
"BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 
+ }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + 
"MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + 
"BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + 
}, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=2048,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + 
"BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..f589fa4267a03989cf23af9f834a734a53319a39 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,86 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..ee625c093696aaa984c44c5597056958968a0808 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16.json @@ -0,0 +1,86 @@ 
+{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 0 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 
32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 
16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + 
"BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + 
"MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + 
"BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + 
"MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git 
a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=64,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json 
b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json new file mode 100644 index 
0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ 
b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": 
{ + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 
+ }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 
16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=256,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + 
"15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + 
"MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx936,num_cus=80,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + 
"BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, 
+ "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=64,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + 
"BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8b42cb27d0153ed65199435f79208239b03e2d --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 32 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 118 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 166 + + } + +} \ No newline at end of file diff --git 
a/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..9d15ed418196961cbdcf8956dcc02dd4f9fe8613 --- /dev/null +++ b/aiter/moe_c_configs/E=256,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w4a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 16 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "MODE": 290 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=32,N=1024,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json 
b/aiter/moe_c_configs/E=32,N=1024,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..83a038e8e8607dd4363208f3ce57d85dcc6f7e43 --- /dev/null +++ b/aiter/moe_c_configs/E=32,N=1024,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, 
+ "kloops": 2, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 28, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=32,N=1024,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json b/aiter/moe_c_configs/E=32,N=1024,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..cc8ea14789de27f64e5c9c9680184fc53d17b837 --- /dev/null +++ b/aiter/moe_c_configs/E=32,N=1024,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=32,N=2048,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json b/aiter/moe_c_configs/E=32,N=2048,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..83a038e8e8607dd4363208f3ce57d85dcc6f7e43 --- /dev/null +++ b/aiter/moe_c_configs/E=32,N=2048,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 2, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 4, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 7, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, 
+ "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 28, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 56, + "nloops": 4 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=32,N=2048,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json b/aiter/moe_c_configs/E=32,N=2048,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json new file mode 100644 index 0000000000000000000000000000000000000000..cc8ea14789de27f64e5c9c9680184fc53d17b837 --- /dev/null +++ b/aiter/moe_c_configs/E=32,N=2048,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128],is_cuda_kernel=True.json @@ -0,0 +1,191 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + 
"9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "1024": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "4096": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "8192": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + }, + "16384": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + 
"nloops": 4 + }, + "32768": { + "BLOCK_SIZE_M": 48, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "kloops": 1, + "nloops": 4 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16,is_bottom=True.json b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..079e76c1e08c35744b248905dae6fcf9d4de2eb5 --- /dev/null +++ b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,102 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "8192": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16.json 
b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..2f659fc79e6082c02425c13db583b31ff266a481 --- /dev/null +++ b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx936,num_cus=80,dtype=int4_w4a16.json @@ -0,0 +1,102 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "8192": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=64,dtype=int4_w4a16,is_bottom=True.json b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=64,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..079e76c1e08c35744b248905dae6fcf9d4de2eb5 --- /dev/null +++ 
b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=64,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,102 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "8192": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=64,dtype=int4_w4a16.json b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=64,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..2f659fc79e6082c02425c13db583b31ff266a481 --- /dev/null +++ b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=64,dtype=int4_w4a16.json @@ -0,0 +1,102 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "5": { + 
"BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "8192": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=72,dtype=int4_w4a16,is_bottom=True.json b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=72,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..079e76c1e08c35744b248905dae6fcf9d4de2eb5 --- /dev/null +++ b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=72,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,102 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + 
"11": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + }, + "8192": { + "BLOCK_SIZE_M": 16, + "MODE": 2 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=72,dtype=int4_w4a16.json b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=72,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..2f659fc79e6082c02425c13db583b31ff266a481 --- /dev/null +++ b/aiter/moe_c_configs/E=384,N=256,gfx_version=gfx938,num_cus=72,dtype=int4_w4a16.json @@ -0,0 +1,102 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, 
+ "MODE": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + }, + "8192": { + "BLOCK_SIZE_M": 16, + "MODE": 5 + } +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=384,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=384,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 
86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=384,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=384,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=384,N=384,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + 
"16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff 
--git a/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json 
b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json new file mode 100644 index 
0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ 
b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=128,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx936,num_cus=80,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + 
}, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=fp8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 
16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + 
"BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=256,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + 
"MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json b/aiter/moe_c_configs/E=512,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc799e8113b4b4c65d04ef461d4a33432ba6cdb --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,112 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 42 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 38 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 46 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 43 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "2048": { + 
"BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 86 + + } + +} \ No newline at end of file diff --git a/aiter/moe_c_configs/E=512,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json b/aiter/moe_c_configs/E=512,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..499fc19617d2de1f0938bd950bb1865a74a56a3a --- /dev/null +++ b/aiter/moe_c_configs/E=512,N=512,gfx_version=gfx938,num_cus=72,dtype=int8_w8a8.json @@ -0,0 +1,111 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "2": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "3": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "4": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "5": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "6": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "7": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "8": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "9": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "10": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "11": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "12": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "13": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "14": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "15": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "16": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "32": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "64": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "128": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "256": { + "BLOCK_SIZE_M": 16, + "MODE": 121 + }, + "512": { + "BLOCK_SIZE_M": 16, + "MODE": 98 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "MODE": 183 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "MODE": 146 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "16384": { + 
"BLOCK_SIZE_M": 32, + "MODE": 160 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "MODE": 160 + } + +} \ No newline at end of file diff --git a/aiter/ops/__init__.py b/aiter/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0bde01b04e250caa825555c8a1926b3fbcb23ad --- /dev/null +++ b/aiter/ops/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT + \ No newline at end of file diff --git a/aiter/ops/activation.py b/aiter/ops/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..c89a06296c3fa58abb071369fccd670efca869a4 --- /dev/null +++ b/aiter/ops/activation.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: MIT +from torch import Tensor +from ..jit.core import compile_ops + + +MD_NAME = "module_activation" + + +@compile_ops("module_activation") +def silu_and_mul(out: Tensor, input: Tensor) -> None: ... + + +@compile_ops("module_activation") +def scaled_silu_and_mul(out: Tensor, input: Tensor, scale: Tensor) -> None: ... + + +@compile_ops("module_activation") +def gelu_and_mul(out: Tensor, input: Tensor) -> None: ... + + +@compile_ops("module_activation") +def gelu_tanh_and_mul(out: Tensor, input: Tensor) -> None: ... diff --git a/aiter/ops/aiter_operator.py b/aiter/ops/aiter_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..9de5578295030dc1bd363a942c17fba73b1fe27a --- /dev/null +++ b/aiter/ops/aiter_operator.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: MIT +from torch import Tensor +from ..jit.core import compile_ops + +MD_NAME = "module_aiter_operator" + + +@compile_ops("module_aiter_operator") +def add(input: Tensor, other: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_operator") +def sub(input: Tensor, other: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_operator") +def mul(input: Tensor, other: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_operator") +def div(input: Tensor, other: Tensor) -> Tensor: ... 
+ + +@compile_ops("module_aiter_operator") +def add_(input: Tensor, other: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_operator") +def sub_(input: Tensor, other: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_operator") +def mul_(input: Tensor, other: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_operator") +def div_(input: Tensor, other: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_unary") +def sigmoid(input: Tensor) -> Tensor: ... + + +@compile_ops("module_aiter_unary") +def tanh(input: Tensor) -> Tensor: ... diff --git a/aiter/ops/attention.py b/aiter/ops/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..2945dc425ba1e8f0f7387ace771c4d72d9984daa --- /dev/null +++ b/aiter/ops/attention.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: MIT +import torch +from typing import Optional +from ..jit.core import ( + compile_ops, +) + +MD_NAME = "module_attention" + + +@compile_ops("module_attention") +def pa_fwd_naive( + # [num_seqs, num_heads, head_size] + query: torch.Tensor, + # [num_blocks, num_kv_heads, head_size/x, block_size, x] + key_cache: torch.Tensor, + # [num_blocks, num_kv_heads, head_size, block_size] + value_cache: torch.Tensor, + # [num_seqs, max_num_blocks_per_seq] + block_tables: torch.Tensor, + # [num_seqs] + context_lens: torch.Tensor, + k_dequant_scales: torch.Tensor, + v_dequant_scales: torch.Tensor, + max_seq_len: int, + num_kv_heads: int, + scale_s: float, + scale_k: float, + scale_v: float, + block_size: int, + quant_algo: int, + out: Optional[torch.Tensor] = None, +) -> torch.Tensor: ... 
# Signature-only declaration: the Python body is `...`, so the callable
# behaviour presumably comes from the `compile_ops("module_attention_asm")`
# decorator (imported from ..jit.core, not visible here) -- TODO confirm.
# Tensor layouts are not stated in this declaration.
# `K_QScale` / `V_QScale` are Optional but have no default, so callers must
# pass them explicitly (None is allowed by the annotation).
@compile_ops("module_attention_asm")
def pa_fwd_asm(
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    block_tables: torch.Tensor,
    context_lens: torch.Tensor,
    max_num_blocks: int,
    K_QScale: Optional[torch.Tensor],
    V_QScale: Optional[torch.Tensor],
    out_: Optional[torch.Tensor] = None,
    high_precision: Optional[
        int
    ] = 1,  # [0, 1, 2] 2 is the highest precision, this is only for fp8 kvcache
) -> torch.Tensor: ...


# Signature-only declaration bound to "module_pa" through `compile_ops`.
# No return annotation is given. NOTE(review): `out`, `exp_sums`,
# `max_logits` and `tmp_out` lead the parameter list and are presumably
# buffers written in place by the kernel -- confirm against the kernel source.
# `alibi_slopes` and `fp8_out_scale` are Optional but have no default.
@compile_ops("module_pa")
def paged_attention_rocm(
    out: torch.Tensor,
    exp_sums: torch.Tensor,
    max_logits: torch.Tensor,
    tmp_out: torch.Tensor,
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    num_kv_heads: int,
    scale: float,
    block_tables: torch.Tensor,
    context_lens: torch.Tensor,
    block_size: int,
    max_context_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
    k_scale: float,
    v_scale: float,
    fp8_out_scale: Optional[torch.Tensor],
    partition_size: int,
): ...


# Signature-only declaration bound to "module_pa_ragged" through `compile_ops`.
# Compared with `paged_attention_rocm` it takes a single `workspace_buffer`
# instead of `exp_sums` / `max_logits` / `tmp_out`, `max_num_partitions`
# instead of `max_context_len`, and two extra arguments: `kv_cache_layout`
# and `logits_soft_cap`.
# NOTE(review): accepted values for `kv_cache_dtype` / `kv_cache_layout` are
# not visible here -- verify against the kernel source.
@compile_ops("module_pa_ragged")
def paged_attention_ragged(
    out: torch.Tensor,
    workspace_buffer: torch.Tensor,
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    num_kv_heads: int,
    scale: float,
    block_tables: torch.Tensor,
    context_lens: torch.Tensor,
    block_size: int,
    max_num_partitions: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
    kv_cache_layout: str,
    logits_soft_cap: float,
    k_scale: float,
    v_scale: float,
    fp8_out_scale: Optional[torch.Tensor],
    partition_size: int,
): ...
+ + +MD_NAME = "module_mla_asm" + + +@compile_ops(MD_NAME) +def mla_decode_stage1_asm_fwd( + # [num_seqs, num_heads, head_size] + Q: torch.Tensor, + # [num_page, page_size, num_kv_heads, kv_lora_rank + qk_rope_head_dim] + KV: torch.Tensor, + # [batch_size+1] + qo_indptr: torch.Tensor, + # [batch_size+1] + kv_indptr: torch.Tensor, + # [num_page_used] + kv_page_indices: torch.Tensor, + # [batch_size] + kv_last_page_lens: torch.Tensor, + max_seqlen_q: int, + softmax_scale: float, + # [batch_size, num_kv_splits, num_heads, v_head_dim] + splitData: torch.Tensor, + # [batch_size, num_kv_splits, num_heads, 1] + splitLse: torch.Tensor, +): ... + + +@compile_ops(MD_NAME) +def mla_prefill_asm_fwd( + # [num_seqs, num_heads, head_size] + Q: torch.Tensor, + # [num_page, page_size, num_kv_heads, kv_lora_rank + qk_rope_head_dim] + KV: torch.Tensor, + # [batch_size+1] + qo_indptr: torch.Tensor, + # [batch_size+1] + kv_indptr: torch.Tensor, + # [num_page_used] + kv_page_indices: torch.Tensor, + # [batch_size] + kv_last_page_lens: torch.Tensor, + max_seqlen_q: int, + softmax_scale: float, + # [batch_size, num_kv_splits, num_heads, v_head_dim] + splitData: torch.Tensor, + # [batch_size, num_kv_splits, num_heads, 1] + splitLse: torch.Tensor, +): ... diff --git a/aiter/ops/awq_dq_asm.py b/aiter/ops/awq_dq_asm.py new file mode 100644 index 0000000000000000000000000000000000000000..188c941c2bf2d953225a8beff76a28282d2d9648 --- /dev/null +++ b/aiter/ops/awq_dq_asm.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: MIT +import torch +from torch import Tensor +from typing import Optional +from ..jit.core import compile_ops + + +MD_NAME = "module_awq_dq_asm" + + +@compile_ops("module_awq_dq_asm") +def awq_dq_asm( + out: Tensor, + mat1: Tensor, + zero: Optional[Tensor] = None, + scalar: Optional[Tensor] = None, +)->None: ... 
+ diff --git a/aiter/ops/awq_gemm_asm.py b/aiter/ops/awq_gemm_asm.py new file mode 100644 index 0000000000000000000000000000000000000000..a7a7463fa44bf88971af49811a399ef610208cfd --- /dev/null +++ b/aiter/ops/awq_gemm_asm.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: MIT +import torch +from torch import Tensor +from typing import Optional +from ..jit.core import compile_ops + + +MD_NAME = "module_awq_gemm_asm" + + +@compile_ops("module_awq_gemm_asm") +def awq_gemm_asm( + out: Tensor, + mat1: Tensor, + mat2: Tensor, + zero: Optional[Tensor] = None, + scalar: Optional[Tensor] = None, +)->None: ... + +@compile_ops("module_awq_gemm_asm") +def awq_gemm_asm_tuning( + out: Tensor, + mat1: Tensor, + mat2: Tensor, + zero: Optional[Tensor] = None, + scalar: Optional[Tensor] = None, + solutionid: int = 0, + jsonfile: str = None, +)->None: ... + diff --git a/aiter/ops/batched_gemm_op_a8w8.py b/aiter/ops/batched_gemm_op_a8w8.py new file mode 100644 index 0000000000000000000000000000000000000000..5f6e55a107ec3fa2b6f7a715e525a8470c3cceaf --- /dev/null +++ b/aiter/ops/batched_gemm_op_a8w8.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: MIT + +import torch +from torch import Tensor +from typing import Optional +import functools +import pandas as pd +from ..jit.core import ( + compile_ops, + AITER_CORE_DIR, +) +from ..utility import dtypes +from ..jit.utils.chip_info import get_cu_num + + +@compile_ops("module_batched_gemm_a8w8", fc_name="batched_gemm_a8w8") +def batched_gemm_a8w8( + XQ: Tensor, + WQ: Tensor, + x_scale: Tensor, + w_scale: Tensor, + out: Tensor, + bias: Optional[Tensor] = None, + splitK=0, +): ... 
@functools.lru_cache(maxsize=1024)
def compute_batched_gemm_SplitK(
    M: int, N: int, K: int, tile_m: int, tile_n: int, tile_k: int
):
    """Pick a split-K value from the problem size and the kernel tile shape.

    The M x N output is covered by ``ceil(M / tile_m) * ceil(N / tile_n)``
    tiles. The returned value is the largest ``s`` such that, for every step
    ``1..s``, ``2**step`` does not exceed the number of compute units per
    tile and ``2**step * tile_k`` stays below ``2 * K``; it is 0 when no step
    satisfies both conditions.

    Results are memoised per argument tuple, so the value of ``get_cu_num()``
    seen on the first call is baked into the cached result.

    Raises:
        ZeroDivisionError: if ``M`` or ``N`` is 0 (the tile count is zero).
    """
    cu_num = get_cu_num()
    # Ceil-divide each output dimension by its tile edge.
    tile_num = ((M + tile_m - 1) // tile_m) * ((N + tile_n - 1) // tile_n)
    cusPerTile = cu_num / tile_num
    splitK = 0
    while cusPerTile >= pow(2, splitK + 1) and (pow(2, splitK + 1) * tile_k) < 2 * K:
        splitK += 1
    return splitK


@functools.lru_cache(maxsize=1024)
def get_CKBatchedGEMM_config(
    B: int,
    M: int,
    N: int,
    K: int,
):
    """Look up the tuned kernel entry for a ``(B, M, N, K)`` problem.

    On first use the table is read from
    ``{AITER_CORE_DIR}/aiter/configs/a8w8_tuned_batched_gemm.csv``,
    de-duplicated, indexed by ``(B, M, N, K)`` and stored as an attribute on
    this function so the CSV is parsed only once per process.

    Returns:
        The row for the shape as a dict, extended with integer ``tile_m``,
        ``tile_n`` and ``tile_k`` parsed from its ``kernelName`` column, or
        ``None`` when the shape has no entry. The dict is the table's own
        entry (not a copy), so callers should treat it as read-only.
    """
    if not hasattr(get_CKBatchedGEMM_config, "ck_batched_gemm_dict"):
        ck_batched_gemm_dict = pd.read_csv(
            f"{AITER_CORE_DIR}/aiter/configs/a8w8_tuned_batched_gemm.csv"
        ).drop_duplicates()
        get_CKBatchedGEMM_config.ck_batched_gemm_dict = ck_batched_gemm_dict.set_index(
            ["B", "M", "N", "K"]
        ).to_dict("index")
    config = get_CKBatchedGEMM_config.ck_batched_gemm_dict.get((B, M, N, K), None)
    if config is not None:
        # The 4th "_"-separated field of kernelName is "x"-separated; its
        # first piece is skipped and the next three are tile_m, tile_n, tile_k.
        mnk = config["kernelName"].split("_")[3].split("x")[1:]
        config["tile_m"] = int(mnk[0])
        config["tile_n"] = int(mnk[1])
        config["tile_k"] = int(mnk[2])
    return config


def batched_gemm_a8w8_CK(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    bias: Optional[Tensor] = None,
    dtype=dtypes.bf16,
    splitK: Optional[int] = None,
):
    """Allocate the output and dispatch to ``batched_gemm_a8w8``.

    Args:
        XQ: indexed as ``(b, m, k)`` via ``XQ.shape[0..2]``.
        WQ: ``WQ.shape[1]`` is taken as ``n``
            (presumably ``(b, n, k)`` -- TODO confirm).
        x_scale, w_scale: forwarded unchanged to the kernel.
        bias: optional, forwarded unchanged.
        dtype: output dtype; must be ``dtypes.bf16`` or ``dtypes.fp16``.
        splitK: explicit split-K value. When ``None`` the tuned table's
            ``splitK`` for this shape is used, or 0 if the shape is untuned.

    Returns:
        Whatever ``batched_gemm_a8w8`` returns for a freshly allocated
        ``(b, m, n)`` output tensor on ``XQ.device``.

    Raises:
        AssertionError: if ``dtype`` is not bf16/fp16.
    """
    assert dtype in [
        dtypes.bf16,
        dtypes.fp16,
    ], f"Output {dtype=} is currently not supported in batched_gemm_a8w8"

    b = XQ.shape[0]
    m = XQ.shape[1]
    n = WQ.shape[1]
    k = XQ.shape[2]
    # The lookup runs even when splitK is given, matching the original call
    # order (it also triggers the one-time CSV load).
    ck_config = get_CKBatchedGEMM_config(b, m, n, k)
    if splitK is None:
        splitK = ck_config["splitK"] if ck_config is not None else 0
    Y = torch.empty(b, m, n, dtype=dtype, device=XQ.device)
    return batched_gemm_a8w8(XQ, WQ, x_scale, w_scale, Y, bias, splitK)


# Signature-only declaration: the body is `...`; the implementation is
# presumably supplied by `compile_ops` from "module_batched_gemm_a8w8_tune"
# under the name "batched_gemm_a8w8_tune" -- TODO confirm in ..jit.core.
@compile_ops("module_batched_gemm_a8w8_tune", fc_name="batched_gemm_a8w8_tune")
def batched_gemm_a8w8_tune(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    out: Tensor,
    kernelId: int,
    splitK=0,
): ...
diff --git a/aiter/ops/batched_gemm_op_bf16.py b/aiter/ops/batched_gemm_op_bf16.py new file mode 100644 index 0000000000000000000000000000000000000000..85895a8c2c21773a77e76242458c68d01be194c3 --- /dev/null +++ b/aiter/ops/batched_gemm_op_bf16.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: MIT + +import torch +from torch import Tensor +from typing import Optional +import functools +import pandas as pd +from ..jit.core import ( + compile_ops, + AITER_CORE_DIR, +) +from ..utility import dtypes +from ..jit.utils.chip_info import get_cu_num + + +@compile_ops("module_batched_gemm_bf16", fc_name="batched_gemm_bf16") +def batched_gemm_bf16( + XQ: Tensor, WQ: Tensor, out: Tensor, bias: Optional[Tensor] = None, splitK=0 +): ... + + +@functools.lru_cache(maxsize=1024) +def compute_batched_gemm_SplitK( + M: int, N: int, K: int, tile_m: int, tile_n: int, tile_k: int +): + + cu_num = get_cu_num() + tile_num = ((M + tile_m - 1) // tile_m) * ((N + tile_n - 1) // tile_n) + cusPerTile = cu_num / tile_num + splitK = 0 + while cusPerTile >= pow(2, splitK + 1) and (pow(2, splitK + 1) * tile_k) < 2 * K: + splitK += 1 + return splitK + + +@functools.lru_cache(maxsize=1024) +def get_CKBatchedGEMM_config( + B: int, + M: int, + N: int, + K: int, +): + if not hasattr(get_CKBatchedGEMM_config, "ck_batched_gemm_dict"): + ck_batched_gemm_dict = pd.read_csv( + f"{AITER_CORE_DIR}/aiter/configs/bf16_tuned_batched_gemm.csv" + ).drop_duplicates() + get_CKBatchedGEMM_config.ck_batched_gemm_dict = ck_batched_gemm_dict.set_index( + ["B", "M", "N", "K"] + ).to_dict("index") + config = get_CKBatchedGEMM_config.ck_batched_gemm_dict.get((B, M, N, K), None) + if config != None: + mnk = config["kernelName"].split("_")[2].split("x")[1:] + config["tile_m"] = int(mnk[0]) + config["tile_n"] = int(mnk[1]) + config["tile_k"] = int(mnk[2]) + return config + + +def batched_gemm_bf16_CK( + XQ: Tensor, + WQ: Tensor, + bias: Optional[Tensor] = None, + dtype=dtypes.bf16, + splitK: Optional[int] = None, +): + assert 
dtype in [ + dtypes.bf16, + dtypes.fp16, + ], f"Output {dtype=} is currently not supported in batched_gemm_bf16" + + b = XQ.shape[0] + m = XQ.shape[1] + n = WQ.shape[1] + k = XQ.shape[2] + ck_config = get_CKBatchedGEMM_config(b, m, n, k) + if splitK == None: + if ck_config != None: + splitK = ck_config["splitK"] + else: + splitK = 0 + Y = torch.empty(b, m, n, dtype=dtype, device=XQ.device) + return batched_gemm_bf16(XQ, WQ, Y, bias, splitK) + + +@compile_ops("module_batched_gemm_bf16_tune", fc_name="batched_gemm_bf16_tune") +def batched_gemm_bf16_tune( + XQ: Tensor, WQ: Tensor, out: Tensor, kernelId: int, splitK=0 +): ... diff --git a/aiter/ops/cache.py b/aiter/ops/cache.py new file mode 100644 index 0000000000000000000000000000000000000000..42e0c9cba5d1d1fb6b56e50af469818be212e837 --- /dev/null +++ b/aiter/ops/cache.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: MIT +from torch import Tensor +from ..jit.core import compile_ops + +MD_NAME = "module_cache" + + +@compile_ops("module_cache") +def swap_blocks(src: Tensor, dst: Tensor, block_mapping: Tensor) -> None: ... + + +@compile_ops("module_cache") +def copy_blocks( + key_caches: Tensor, value_caches: Tensor, block_mapping: Tensor +) -> None: ... + + +@compile_ops("module_cache") +def reshape_and_cache( + key: Tensor, + value: Tensor, + key_cache: Tensor, + value_cache: Tensor, + slot_mapping: Tensor, + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + asm_layout: bool, +) -> None: ... + + +@compile_ops("module_cache") +def reshape_and_cache_flash( + key: Tensor, + value: Tensor, + key_cache: Tensor, + value_cache: Tensor, + slot_mapping: Tensor, + kv_cache_dtype: str, + k_scale: Tensor, + v_scale: Tensor, +) -> None: ... + + +@compile_ops("module_cache") +def reshape_and_cache_with_pertoken_quant( + key: Tensor, + value: Tensor, + key_cache: Tensor, + value_cache: Tensor, + k_dequant_scales: Tensor, + v_dequant_scales: Tensor, + slot_mapping: Tensor, + asm_layout: bool, +) -> None: ... 
+ + +@compile_ops("module_cache") +def reshape_and_cache_with_block_quant( + key: Tensor, + value: Tensor, + key_cache: Tensor, + value_cache: Tensor, + k_dequant_scales: Tensor, + v_dequant_scales: Tensor, + slot_mapping: Tensor, + asm_layout: bool, +) -> None: ... + + +@compile_ops("module_cache") +def convert_fp8( + dst_cache: Tensor, src_cache: Tensor, scale: float, kv_cache_dtype: str +) -> None: ... diff --git a/aiter/ops/communication.py b/aiter/ops/communication.py new file mode 100644 index 0000000000000000000000000000000000000000..ace86a92bd1fa1ecbd3221add9bc4bc1537ae5e8 --- /dev/null +++ b/aiter/ops/communication.py @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: MIT +import torch +from torch import Tensor +import torch.distributed as dist +from ..dist.parallel_state import ( + ensure_model_parallel_initialized, + init_distributed_environment, + set_custom_all_reduce, + get_tp_group, + destroy_model_parallel, + destroy_distributed_environment, +) +from ..dist.utils import get_open_port, get_distributed_init_method, get_ip +import aiter +import logging + +logger = logging.getLogger("aiter") + + +def init_dist_env(world_size, rankID): + set_custom_all_reduce(True) + init_distributed_environment( + world_size=world_size, + rank=rankID, + distributed_init_method=get_distributed_init_method(get_ip(), get_open_port()), + ) + ensure_model_parallel_initialized(world_size, 1) + + # hack custom_allreduce + tp_grp = get_tp_group() + ca_comm = tp_grp.ca_comm + + # signal + signal = torch.zeros(world_size * 64, dtype=torch.int64, device=rankID) + + ca_comm.signal = signal + ca_comm.register_buffer(signal) + logger.debug(f"RANK: {rankID}/{world_size} init_dist_env...") + + +def destroy_dist_env(): + if dist.is_initialized(): + destroy_model_parallel() + destroy_distributed_environment() + torch.cuda.empty_cache() + +""" +def all_reduce_asm(inp: torch.Tensor): + tp_grp = get_tp_group() + ca = tp_grp.ca_comm + + if ca._IS_CAPTURING: + if 
torch.cuda.is_current_stream_capturing(): + return aiter.all_reduce_asm_( + inp, ca._ptr, ca.signal, ca.buffer, ca._IS_CAPTURING + ) + else: + # if warm up, mimic the allocation pattern + # since custom allreduce is out-of-place + return torch.empty_like(inp) + else: + # note: outside of cuda graph context, + # custom allreduce incurs a cost of cudaMemcpy, which should + # be small(<=1% of overall latency) compared to the performance + # gains of using custom kernels + return aiter.all_reduce_asm_( + inp, ca._ptr, ca.signal, ca.buffer, ca._IS_CAPTURING + ) + + +def all_reduce_rmsnorm( + input: Tensor, residual_in: Tensor, weight: Tensor, bias: Tensor, epsilon: float +): + tp_grp = get_tp_group() + ca = tp_grp.ca_comm + + return aiter.all_reduce_rmsnorm_( + input, + residual_in, + weight, + bias, + epsilon, + ca._ptr, + ca.signal, + ca.buffer, + ca._IS_CAPTURING, + ) + + +def all_reduce_rmsnorm_quant( + input: Tensor, + residual_in: Tensor, + xscale: Tensor, + weight: Tensor, + bias: Tensor, + epsilon: float, +): + tp_grp = get_tp_group() + ca = tp_grp.ca_comm + + return aiter.all_reduce_rmsnorm_quant_( + input, + residual_in, + xscale, + weight, + bias, + epsilon, + ca._ptr, + ca.signal, + ca.buffer, + ca._IS_CAPTURING, + ) +""" diff --git a/aiter/ops/custom.py b/aiter/ops/custom.py new file mode 100644 index 0000000000000000000000000000000000000000..f78ace3aaf18c6b83e3a7e8974563b98be599d5a --- /dev/null +++ b/aiter/ops/custom.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: MIT +from torch import Tensor +from ..jit.core import compile_ops + +MD_NAME = "module_custom" + + +@compile_ops("module_custom") +def wvSpltK(in_a: Tensor, in_b: Tensor, out_c: Tensor, N_in: int, CuCount: int): ... + + +@compile_ops("module_custom") +def LLMM1(in_a: Tensor, in_b: Tensor, out_c: Tensor, rows_per_block: int): ... 
diff --git a/aiter/ops/custom_all_reduce.py b/aiter/ops/custom_all_reduce.py new file mode 100644 index 0000000000000000000000000000000000000000..9b01a82c8c3939f045d03e5e92238a27d8992aa2 --- /dev/null +++ b/aiter/ops/custom_all_reduce.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: MIT + +from typing import List, Optional, Tuple + +import torch + +from ..jit.core import compile_ops + +MD_NAME = "module_custom_all_reduce" + + +@compile_ops("module_custom_all_reduce") +def init_custom_ar( + meta: torch.Tensor, + rank_data: torch.Tensor, + handles: List[torch.Tensor], + offsets: List[int], + rank: int, + fully_connected: bool, +) -> int: ... + + +@compile_ops("module_custom_all_reduce") +def all_reduce( + _fa: int, + inp: torch.Tensor, + out: torch.Tensor, + open_fp8_quant: bool, + reg_buffer: Optional[torch.Tensor] = None, +) -> None: ... + + +@compile_ops("module_custom_all_reduce") +def all_gather_reg(_fa: int, inp: torch.Tensor, out: torch.Tensor) -> None: ... + + +@compile_ops("module_custom_all_reduce") +def all_gather_unreg( + _fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor +) -> None: ... + + +@compile_ops("module_custom_all_reduce") +def fused_allreduce_rmsnorm( + _fa: int, + inp: torch.Tensor, + res_inp: torch.Tensor, + res_out: torch.Tensor, + out: torch.Tensor, + w: torch.Tensor, + eps: float, + reg_buffer: Optional[torch.Tensor] = None, +) -> None: ... + + +def all_reduce_asm_fake_tensor( + inp: torch.Tensor, + ca: int, + reg_sig: torch.Tensor, + reg_buffer: torch.Tensor, + isGraph: bool, +) -> torch.Tensor: + + return torch.empty_like( + inp, + dtype=inp.dtype, + device=inp.device, + ) + + +@compile_ops("module_custom_all_reduce", gen_fake=all_reduce_asm_fake_tensor) +def all_reduce_asm_( + inp: torch.Tensor, + ca: int, + reg_sig: torch.Tensor, + reg_buffer: torch.Tensor, + isGraph: bool, +) -> torch.Tensor: ... 


def all_reduce_rmsnorm_fake_tensors(
    input: torch.Tensor,
    residual_in: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    epsilon: float,
    ca: int,
    reg_sig: torch.Tensor,
    reg_buffer: torch.Tensor,
    isGraph: bool,
) -> List[torch.Tensor]:
    """Return two uninitialized tensors shaped like ``input``.

    Passed as ``gen_fake`` for ``all_reduce_rmsnorm_``; it mirrors that op's
    signature, but only ``input`` is read. Both results copy the dtype,
    device and ``requires_grad`` flag of ``input`` and are returned as
    ``[output, residual_out]``.
    """

    def _like_input() -> torch.Tensor:
        # One fresh allocation per call so the two results never alias.
        return torch.empty_like(
            input,
            dtype=input.dtype,
            device=input.device,
            requires_grad=input.requires_grad,
        )

    output = _like_input()
    residual_out = _like_input()
    return [output, residual_out]


@compile_ops("module_custom_all_reduce", gen_fake=all_reduce_rmsnorm_fake_tensors)
def all_reduce_rmsnorm_(
    input: torch.Tensor,
    residual_in: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    epsilon: float,
    ca: int,
    reg_sig: torch.Tensor,
    reg_buffer: torch.Tensor,
    isGraph: bool,
) -> List[torch.Tensor]: ...


# def all_reduce_rmsnorm_quant_fake_tensors(
#     input: torch.Tensor,
#     residual_in: torch.Tensor,
#     weight: torch.Tensor,
#     xscale: torch.Tensor,
#     bias: torch.Tensor,
#     epsilon: float,
#     ca: int,
#     reg_sig: torch.Tensor,
#     reg_buffer: torch.Tensor,
#     isGraph: bool,
# ) -> List[torch.Tensor]:

#     N = input.size(-1)
#     M = input.numel() // N

#     output = torch.empty_like(
#         input, dtype=input.dtype, device=input.device, requires_grad=input.requires_grad
#     )

#     residual_out = torch.empty_like(
#         input, dtype=input.dtype, device=input.device, requires_grad=input.requires_grad
#     )

#     y_scale = torch.empty((M, 1), dtype=torch.float32, device=input.device)

#     return [output, residual_out, y_scale]


# @compile_ops("module_custom_all_reduce", gen_fake=all_reduce_rmsnorm_quant_fake_tensors)
# def all_reduce_rmsnorm_quant_(
#     input: torch.Tensor,
#     residual_in: torch.Tensor,
#     weight: torch.Tensor,
#     xscale: torch.Tensor,
#     bias: torch.Tensor,
#     epsilon: float,
#     ca: int,
#     reg_sig: torch.Tensor,
#     reg_buffer: torch.Tensor,
#     isGraph: bool,
# ) -> List[torch.Tensor]: ...


# The remaining `...`-bodied functions are binding stubs handed to
# `compile_ops` together with the module name.


@compile_ops("module_custom_all_reduce")
def dispose(_fa: int) -> None: ...


@compile_ops("module_custom_all_reduce")
def meta_size() -> int: ...


@compile_ops("module_custom_all_reduce")
def register_buffer(
    _fa: int, t: torch.Tensor, handles: List[torch.Tensor], offsets: List[int]
) -> None: ...


# def gen_get_graph_buffer_ipc_meta_fake_tensors(_fa: int) -> List[torch.Tensor]:

#     handle_sz = 64  # sizeof(hipIpcMemHandle_t) is 64 byte
#     num_buffers = 4  # ???
#     handles = torch.empty((handle_sz * num_buffers,), dtype=torch.uint8, device="cuda")

#     offset_tensor = torch.empty((num_buffers,), dtype=torch.int64, device="cuda")

#     return [handles, offset_tensor]


@compile_ops("module_custom_all_reduce")
def get_graph_buffer_ipc_meta(_fa: int) -> Tuple[torch.Tensor, torch.Tensor]: ...


@compile_ops("module_custom_all_reduce")
def register_graph_buffers(
    _fa: int, handles: List[torch.Tensor], offsets: List[torch.Tensor]
) -> None: ...


@compile_ops("module_custom_all_reduce")
def allocate_meta_buffer(size: int) -> torch.Tensor: ...


# def get_meta_buffer_ipc_handle_fake(inp: torch.Tensor) -> torch.Tensor:
#     handle_size = 64
#     if not inp.is_cuda:
#         raise RuntimeError("Input tensor must be on CUDA device")

#     return torch.empty(handle_size, dtype=torch.uint8, device=inp.device)


@compile_ops("module_custom_all_reduce")
def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor: ...
# \ No newline at end of file
# diff --git a/aiter/ops/enum.py b/aiter/ops/enum.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..d4e054d9617d8460957e308107accd5c1e6e298c
# --- /dev/null
# +++ b/aiter/ops/enum.py
# @@ -0,0 +1,15 @@
from ..jit.core import compile_ops

# from enum import Enum as Enum
Enum = int


@compile_ops("module_aiter_enum", "ActivationType")
def _ActivationType(dummy: int) -> int: ...


@compile_ops("module_aiter_enum", "QuantType")
def _QuantType(dummy: int) -> int: ...

# Each name is bound to the *type* of whatever the wrapped stub returns when
# called with 0. NOTE(review): presumably these are the enum classes exported
# by module_aiter_enum -- confirm against the compiled module.
ActivationType = type(_ActivationType(0))
QuantType = type(_QuantType(0))
# diff --git a/aiter/ops/fused_qk_norm_mrope_cache_quant.py b/aiter/ops/fused_qk_norm_mrope_cache_quant.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..88ab384c26fb72072b59716c9de615c601c225e6
# --- /dev/null
# +++ b/aiter/ops/fused_qk_norm_mrope_cache_quant.py
# @@ -0,0 +1,37 @@
# SPDX-License-Identifier: MIT

from torch import Tensor
from ..jit.core import compile_ops
from typing import List, Optional


# Binding stub: only the signature is declared here; the body is `...` and
# the decorator is handed the module name. The op is annotated as returning
# None. NOTE(review): results are presumably written into the tensor
# arguments (q_out / k_cache / v_cache / k_out / v_out) -- confirm against
# the kernel source.
@compile_ops("module_fused_qk_norm_mrope_cache_quant_shuffle")
def fused_qk_norm_mrope_3d_cache_pts_quant_shuffle(
    qkv: Tensor,
    qw: Tensor,
    kw: Tensor,
    cos_sin: Tensor,
    positions: Tensor,
    num_tokens: int,
    num_heads_q: int,
    num_heads_k: int,
    num_heads_v: int,
    head_size: int,
    is_neox_style: bool,
    mrope_section_: List[int],
    is_interleaved: bool,
    eps: float,
    q_out: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    slot_mapping: Tensor,
    per_tensor_k_scale: Tensor,
    per_tensor_v_scale: Tensor,
    k_out: Optional[Tensor],
    v_out: Optional[Tensor],
    return_kv: bool,
    use_shuffle_layout: bool,
    block_size: int,
    x: int,
    rotary_dim: int = 0,
) -> None: ...
# diff --git a/aiter/ops/fused_qk_norm_rope_cache_quant.py b/aiter/ops/fused_qk_norm_rope_cache_quant.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..500be650737a9517e4117c04bb60900ce163c002
# --- /dev/null
# +++ b/aiter/ops/fused_qk_norm_rope_cache_quant.py
# @@ -0,0 +1,124 @@
# SPDX-License-Identifier: MIT

import torch
from torch import Tensor
from ..jit.core import compile_ops
from typing import Optional


# Binding stub: signature only, body is `...`; annotated as returning None.
@compile_ops("module_fused_qk_norm_rope_cache_quant_shuffle")
def fused_qk_norm_rope_cache_quant_shuffle(
    qkv: Tensor,
    num_heads_q: int,
    num_heads_k: int,
    num_heads_v: int,
    head_dim: int,
    eps: float,
    qw: Tensor,
    kw: Tensor,
    cos_sin_cache: Tensor,
    is_neox_style: bool,
    pos_ids: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    slot_mapping: Tensor,
    kv_cache_dtype: str,
    k_scale: Tensor,
    v_scale: Tensor,
) -> None: ...


def gen_fused_qk_rmsnorm_fake_tensor(
    q: Tensor,
    q_weight: Tensor,
    q_eps: float,
    k: Tensor,
    k_weight: Tensor,
    k_eps: float,
    q_out: Optional[Tensor],
    k_out: Optional[Tensor],
) -> tuple[Tensor, Tensor]:
    """Return ``(q_out, k_out)``, allocating whichever one is ``None``.

    A missing output is replaced by an uninitialized tensor with the shape,
    dtype and device of the matching input (``q`` or ``k``); an output that
    was supplied is returned unchanged. The weight and eps arguments are not
    read.
    """
    q_result = (
        torch.empty_like(q, dtype=q.dtype, device=q.device) if q_out is None else q_out
    )
    k_result = (
        torch.empty_like(k, dtype=k.dtype, device=k.device) if k_out is None else k_out
    )
    return q_result, k_result


# Binding stub: signature only, body is `...`; annotated as returning None.
@compile_ops("module_fused_qk_norm_rope_cache_quant_shuffle")
def fused_qk_norm_rope_cache_block_quant_shuffle(
    qkv: Tensor,
    num_heads_q: int,
    num_heads_k: int,
    num_heads_v: int,
    head_dim: int,
    eps: float,
    qw: Tensor,
    kw: Tensor,
    cos_sin_cache: Tensor,
    is_neox_style: bool,
    pos_ids: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    slot_mapping: Tensor,
    cu_q_len: Tensor,
    kv_cache_dtype: str,
    k_scale: Tensor,
    v_scale: Tensor,
    max_tokens_per_batch: int = 0,
) -> None: ...


# Binding stub: signature only, body is `...`; annotated as returning None.
# Same parameter list as the mrope variant declared in
# fused_qk_norm_mrope_cache_quant.py minus `mrope_section_` / `is_interleaved`.
@compile_ops("module_fused_qk_norm_rope_cache_quant_shuffle")
def fused_qk_norm_rope_cache_pts_quant_shuffle(
    qkv: Tensor,
    qw: Tensor,
    kw: Tensor,
    cos_sin: Tensor,
    positions: Tensor,
    num_tokens: int,
    num_heads_q: int,
    num_heads_k: int,
    num_heads_v: int,
    head_size: int,
    is_neox_style: bool,
    eps: float,
    q_out: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    slot_mapping: Tensor,
    per_tensor_k_scale: Tensor,
    per_tensor_v_scale: Tensor,
    k_out: Optional[Tensor],
    v_out: Optional[Tensor],
    return_kv: bool,
    use_shuffle_layout: bool,
    block_size: int,
    x: int,
    rotary_dim: int = 0,
) -> None: ...


# Binding stub: signature only, body is `...`; annotated as returning None.
# NOTE(review): the "0"/"1" suffixes suggest two independent q/k streams whose
# results land in out_q01 / out_k01 -- confirm against the kernel source.
@compile_ops("module_fused_qk_norm_rope_cache_quant_shuffle")
def fused_qk_norm_rope_2way(
    q0: Tensor,
    k0: Tensor,
    q1: Tensor,
    k1: Tensor,
    w_q0: Tensor,
    w_k0: Tensor,
    w_q1: Tensor,
    w_k1: Tensor,
    cos_sin0: Tensor,
    cos_sin1: Tensor,
    batch_size: int,
    num_tokens0: int,
    num_tokens1: int,
    num_heads_q: int,
    num_heads_k: int,
    head_size: int,
    is_interleaved: bool,
    eps: float,
    out_q01: Tensor,
    out_k01: Tensor,
) -> None: ...
# diff --git a/aiter/ops/gemm_op_a8w8.py b/aiter/ops/gemm_op_a8w8.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..96fc8572fadd398f13cfeeaa9e0dc5a52298c9e1
# --- /dev/null
# +++ b/aiter/ops/gemm_op_a8w8.py
# @@ -0,0 +1,232 @@
# SPDX-License-Identifier: MIT

import torch
from torch import Tensor
from typing import Optional
import functools
import pandas as pd
from ..jit.core import (
    compile_ops,
    AITER_CORE_DIR,
)
from ..utility import dtypes
from ..jit.utils.chip_info import get_cu_num


# Binding stub: signature only, body is `...`. `Out` is supplied by the
# caller; `bias` and `splitK` are optional.
@compile_ops("module_gemm_a8w8", fc_name="gemm_a8w8")
def gemm_a8w8(
    XQ: torch.Tensor,
    WQ: torch.Tensor,
    x_scale: torch.Tensor,
    w_scale: torch.Tensor,
    Out: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    splitK: int = 0,
) -> torch.Tensor: ...


# Binding stub: signature only, body is `...`. The trailing comments give the
# operand layouts the author documented for this kernel.
@compile_ops("module_gemm_a8w8_asm", fc_name="gemm_a8w8_asm")
def gemm_a8w8_asm(
    XQ: Tensor,  # A:[M, K] i8
    WQ: Tensor,  # B:[N, K] i8 -> shuffle layout(32,16)
    x_scale: Tensor,  # A_scale:[M, 1] f32
    w_scale: Tensor,  # B_scale:[1, N] f32
    Out: Tensor,  # Out:[M, N] bf16
    bias: Tensor,  # bias:[1, N] f32
    sub_m: Optional[int] = 128,
    sub_n: Optional[int] = 128,
    pad_a: Optional[int] = 0,
    pad_b: Optional[int] = 0,
    pad_c: Optional[int] = 0,
    splitK: Optional[int] = 0,
) -> torch.Tensor: ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_gemm_a8w8_blockscale", fc_name="gemm_a8w8_blockscale")
def gemm_a8w8_blockscale(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    out: Tensor,
): ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_gemm_a8w8_blockscale_asm", fc_name="flatmm_a8w8_blockscale_asm")
def flatmm_a8w8_blockscale_asm(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    out: Tensor,
): ...


@functools.lru_cache(maxsize=1024)
def compute_gemm_SplitK(M: int, N: int, K: int, tile_m: int, tile_n: int, tile_k: int):
    """Pick a split-K exponent from the CU count and the output tile count.

    The result is the largest ``splitK`` such that ``2**splitK`` does not
    exceed the number of compute units per output tile and
    ``2**splitK * tile_k < 2 * K``. Returns 0 when the problem has no output
    tiles (``M == 0`` or ``N == 0``). Results are memoized per argument tuple.
    """
    cu_num = get_cu_num()
    tile_num = ((M + tile_m - 1) // tile_m) * ((N + tile_n - 1) // tile_n)
    if tile_num == 0:
        # Empty problem: nothing to split, and the division below would fail.
        return 0
    cusPerTile = cu_num / tile_num
    splitK = 0
    while cusPerTile >= pow(2, splitK + 1) and (pow(2, splitK + 1) * tile_k) < 2 * K:
        splitK += 1
    return splitK


@functools.lru_cache(maxsize=1024)
def get_CKGEMM_config(
    M: int,
    N: int,
    K: int,
):
    """Look up the tuned CK GEMM config row for ``(M, N, K)``.

    The CSV ``aiter/configs/a8w8_tuned_gemm.csv`` under ``AITER_CORE_DIR`` is
    read once and kept on the function object as ``ckgemm_dict``. When a row
    is found, ``tile_m``/``tile_n``/``tile_k`` are parsed out of its
    ``kernelName`` field and stored into the row dict before it is returned.
    Returns ``None`` when the shape has no entry.
    """
    if not hasattr(get_CKGEMM_config, "ckgemm_dict"):
        ckgemm_dict = pd.read_csv(
            f"{AITER_CORE_DIR}/aiter/configs/a8w8_tuned_gemm.csv"
        ).drop_duplicates()
        get_CKGEMM_config.ckgemm_dict = ckgemm_dict.set_index(["M", "N", "K"]).to_dict(
            "index"
        )
    config = get_CKGEMM_config.ckgemm_dict.get((M, N, K), None)
    if config is not None:
        # Third "_"-separated field of kernelName, split on "x"; the first
        # piece is dropped and the next three are tile_m, tile_n, tile_k.
        mnk = config["kernelName"].split("_")[2].split("x")[1:]
        config["tile_m"] = int(mnk[0])
        config["tile_n"] = int(mnk[1])
        config["tile_k"] = int(mnk[2])
    return config


@functools.lru_cache(maxsize=1024)
def get_ASMGEMM_config(M: int, N: int, K: int, bias: bool, dtype: torch.dtype):
    """Look up the ASM GEMM config row for ``(M, N, K, bias, str(dtype))``.

    The CSV ``aiter/configs/asm_a8w8_gemm.csv`` under ``AITER_CORE_DIR`` is
    read once and kept on the function object as ``asmgemm_dict``; its
    ``bias`` column is normalized to booleans first. Returns ``None`` when
    there is no matching row.
    """
    if not hasattr(get_ASMGEMM_config, "asmgemm_dict"):
        asmGemmDictDf = pd.read_csv(
            f"{AITER_CORE_DIR}/aiter/configs/asm_a8w8_gemm.csv"
        ).drop_duplicates()
        asmGemmDictDf.bias = asmGemmDictDf.bias.apply(
            lambda s: s in ["True", 1, "true"]
        )
        get_ASMGEMM_config.asmgemm_dict = asmGemmDictDf.set_index(
            ["M", "N", "K", "bias", "outdtype"]
        ).to_dict("index")
    return get_ASMGEMM_config.asmgemm_dict.get((M, N, K, bias, str(dtype)), None)


def gemm_a8w8_ASM(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    bias: Tensor,
    dtype=dtypes.bf16,
    check=False,
):
    """Run the ASM a8w8 GEMM when a tuned config exists for this shape.

    Notes for use gemm_a8w8_ASM:
    1. WQ (weight) must be shuffled, e.g.
       ``weightshuffle = shuffle_weight(weight, layout=(32, 16))``.
    2. The ASM GEMM needs a bias; if there is none, pass
       ``bias=torch.zeros(n, dtype=dtypes.fp32, device='cuda')``.

    With ``check=True`` the output dtype and the scale dtypes are asserted up
    front. Returns the kernel result, or ``None`` when the scales are not
    fp32 or no config matches ``(m, n, k, bias given?, dtype)``.
    """
    if check:
        assert dtype in [
            dtypes.bf16,
        ], f"Output {dtype=} is currently not supported in gemm_a8w8_ASM"
        assert (
            x_scale.dtype == dtypes.fp32 and w_scale.dtype == dtypes.fp32
        ), f"{x_scale.dtype=} or {w_scale.dtype=} must be dtypes.fp32"
    m = XQ.shape[0]
    n = WQ.shape[0]
    k = XQ.shape[-1]
    # Identity test against None: `bias != None` would dispatch to the tensor's
    # comparison operator; `is not None` is a plain bool, as the cached lookup
    # key requires.
    has_bias = bias is not None
    if (
        x_scale.dtype == dtypes.fp32
        and w_scale.dtype == dtypes.fp32
        and (asm_config := get_ASMGEMM_config(m, n, k, has_bias, dtype)) is not None
    ):
        assert (
            bias is not None
        ), "Use asm gemm must give bias, please give a \
            bias=torch.zeros(n,dtype=dtypes.fp32,device='cuda')"
        splitK = asm_config["splitK"]
        Y = torch.empty(m, n, dtype=dtype, device=XQ.device)
        return gemm_a8w8_asm(XQ, WQ, x_scale, w_scale, Y, bias, splitK=splitK)
    return None


def gemm_a8w8_CK(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    bias: Optional[Tensor] = None,
    dtype=dtypes.bf16,
    splitK: Optional[int] = None,
):
    """Allocate the (m, n) output and call ``gemm_a8w8``.

    ``m`` is ``XQ.shape[0]`` and ``n`` is ``WQ.shape[0]``. When ``splitK`` is
    ``None`` it is taken from the tuned config for ``(m, n, k)`` if one
    exists, otherwise 0. Only bf16 and fp16 outputs are accepted.
    """
    assert dtype in [
        dtypes.bf16,
        dtypes.fp16,
    ], f"Output {dtype=} is currently not supported in gemm_a8w8"
    m = XQ.shape[0]
    n = WQ.shape[0]
    k = XQ.shape[-1]
    ck_config = get_CKGEMM_config(m, n, k)
    if splitK is None:
        if ck_config is not None:
            splitK = ck_config["splitK"]
        else:
            splitK = 0
    Y = torch.empty(m, n, dtype=dtype, device=XQ.device)
    return gemm_a8w8(XQ, WQ, x_scale, w_scale, Y, bias, splitK)


def gemm_a8w8_blockscale_CK(
    XQ: Tensor, WQ: Tensor, x_scale: Tensor, w_scale: Tensor, dtype=dtypes.bf16
):
    """Allocate the (m, n) output and call ``gemm_a8w8_blockscale``.

    ``m`` is ``XQ.shape[0]`` and ``n`` is ``WQ.shape[0]``. Only bf16 and fp16
    outputs are accepted.
    """
    assert dtype in [
        dtypes.bf16,
        dtypes.fp16,
    ], f"Output {dtype=} is currently not supported in gemm_a8w8"
    m = XQ.shape[0]
    n = WQ.shape[0]
    Y = torch.empty(m, n, dtype=dtype, device=XQ.device)
    return gemm_a8w8_blockscale(XQ, WQ, x_scale, w_scale, Y)


def flatmm_a8w8_blockscale_ASM(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    dtype=dtypes.fp16,
):
    """Allocate the (m, n) output and call ``flatmm_a8w8_blockscale_asm``.

    ``m`` is ``XQ.shape[0]`` and ``n`` is ``WQ.shape[0]``. Only fp16 output
    is accepted.
    """
    assert dtype in [
        dtypes.fp16,
    ], f"Output {dtype=} is currently not supported in gemm_a8w8"
    m = XQ.shape[0]
    n = WQ.shape[0]
    Y = torch.empty(m, n, dtype=dtype, device=XQ.device)
    return flatmm_a8w8_blockscale_asm(XQ, WQ, x_scale, w_scale, Y)


# Binding stub: signature only, body is `...`.
@compile_ops("module_gemm_a8w8_tune", fc_name="gemm_a8w8_tune")
def gemm_a8w8_tune(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    out: Tensor,
    kernelId: int,
    splitK=0,
): ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_gemm_a8w8_blockscale_tune", fc_name="gemm_a8w8_blockscale_tune")
def gemm_a8w8_blockscale_tune(
    XQ: Tensor,
    WQ: Tensor,
    x_scale: Tensor,
    w_scale: Tensor,
    out: Tensor,
    kernelId: int,
    splitK=0,
): ...
# diff --git a/aiter/ops/gradlib.py b/aiter/ops/gradlib.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..336d427e3a8e87688c2f5705ce901b63e9f96365
# --- /dev/null
# +++ b/aiter/ops/gradlib.py
# @@ -0,0 +1,95 @@
# SPDX-License-Identifier: MIT

import torch
from typing import Optional
from ..jit.core import compile_ops


# Binding stub: signature only, body is `...`.
@compile_ops("module_hipbsolgemm")
def hipb_create_extension() -> None: ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_hipbsolgemm")
def hipb_destroy_extension() -> None: ...


def gen_hipb_mm_fake_tensor(
    mat1: torch.Tensor,
    mat2: torch.Tensor,
    solution_index: int,
    bias: Optional[torch.Tensor] = None,
    out_dtype: Optional[torch.dtype] = None,
    scaleA: Optional[torch.Tensor] = None,
    scaleB: Optional[torch.Tensor] = None,
    scaleOut: Optional[torch.Tensor] = None,
    scaleType: Optional[int] = None,
):
    """Return an uninitialized tensor with the result shape of ``hipb_mm``.

    The shape is ``(mat1.size(0), mat2.size(1))`` on ``mat1``'s device. The
    dtype is ``out_dtype`` when given, otherwise ``mat1``'s dtype. The other
    arguments are not read; they mirror the signature of ``hipb_mm``.
    """
    rows = mat1.size()[0]
    cols = mat2.size()[1]
    result_dtype = mat1.dtype if out_dtype is None else out_dtype
    return torch.empty((rows, cols), dtype=result_dtype, device=mat1.device)


@compile_ops("module_hipbsolgemm", gen_fake=gen_hipb_mm_fake_tensor)
def hipb_mm(
    mat1: torch.Tensor,
    mat2: torch.Tensor,
    solution_index: int,
    bias: Optional[torch.Tensor] = None,
    out_dtype: Optional[torch.dtype] = None,
    scaleA: Optional[torch.Tensor] = None,
    scaleB: Optional[torch.Tensor] = None,
    scaleOut: Optional[torch.Tensor] = None,
    scaleType: Optional[int] = None,
) -> torch.Tensor: ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_hipbsolgemm")
def hipb_findallsols(
    mat1: torch.Tensor,
    mat2: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    out_dtype: Optional[torch.dtype] = None,
    scaleA: Optional[torch.Tensor] = None,
    scaleB: Optional[torch.Tensor] = None,
    scaleC: Optional[torch.Tensor] = None,
    scaleType: Optional[int] = None,
) -> list[int]: ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_hipbsolgemm")
def getHipblasltKernelName() -> None: ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_rocsolgemm")
def rocb_create_extension() -> None: ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_rocsolgemm")
def rocb_destroy_extension() -> None: ...


def gen_rocb_mm_fake_tensor(
    arg0: torch.Tensor, arg1: torch.Tensor, arg2: int
) -> torch.Tensor:
    """Return an uninitialized tensor with the result shape of ``rocb_mm``.

    The shape is ``(arg0.size(0), arg1.size(1))`` with ``arg0``'s dtype and
    device, matching how ``gen_hipb_mm_fake_tensor`` derives its shape from
    both operands. ``arg2`` is not read; it mirrors the signature of
    ``rocb_mm``.
    """
    mat1_sizes = arg0.size()
    # The column count comes from the second operand. The original read
    # arg0 here, which produced a wrong fake shape for non-square inputs.
    mat2_sizes = arg1.size()
    in_dtype = arg0.dtype
    result = torch.empty(
        (mat1_sizes[0], mat2_sizes[1]), dtype=in_dtype, device=arg0.device
    )

    return result


@compile_ops("module_rocsolgemm", gen_fake=gen_rocb_mm_fake_tensor)
def rocb_mm(arg0: torch.Tensor, arg1: torch.Tensor, arg2: int) -> torch.Tensor: ...


# Binding stub: signature only, body is `...`.
@compile_ops("module_rocsolgemm")
def rocb_findallsols(arg0: torch.Tensor, arg1: torch.Tensor) -> list[int]: ...
# diff --git a/aiter/ops/mha.py b/aiter/ops/mha.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..d9cdab235de7026ebe4799a9e512bdd3510eaff7
# --- /dev/null
# +++ b/aiter/ops/mha.py
# @@ -0,0 +1,1628 @@
# SPDX-License-Identifier: MIT

from torch import Tensor, Generator
from typing import Optional, Tuple
from ..jit.core import compile_ops, CK_DIR, AITER_CSRC_DIR
from ..utility import dtypes
import torch


# Binding stub: signature only, body is `...`; no return annotation.
@compile_ops("module_mha_fwd", fc_name="mha_fwd")
def mha_fwd(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    dropout_p: float,
    softmax_scale: float,
    is_causal: bool,
    window_size_left: int,
    window_size_right: int,
    return_softmax_lse: bool,
    return_dropout_randval: bool,
    out: Optional[Tensor] = None,
    bias: Optional[Tensor] = None,
    alibi_slopes: Optional[Tensor] = None,
    gen: Optional[Generator] = None,
): ...


# Binding stub with the same parameter list as `mha_fwd`, bound to a
# different module / function name.
@compile_ops("module_fmha_v3_fwd", fc_name="fmha_v3_fwd")
def fmha_v3_fwd(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    dropout_p: float,
    softmax_scale: float,
    is_causal: bool,
    window_size_left: int,
    window_size_right: int,
    return_softmax_lse: bool,
    return_dropout_randval: bool,
    out: Optional[Tensor] = None,
    bias: Optional[Tensor] = None,
    alibi_slopes: Optional[Tensor] = None,
    gen: Optional[Generator] = None,
): ...


# Binding stub: signature only, body is `...`; annotated as returning a list
# of tensors. Compared with `mha_fwd` it additionally takes cumulative
# sequence-length tensors, max sequence lengths, `logits_soft_cap`,
# `zero_tensors` and an optional `block_table`.
@compile_ops("module_mha_varlen_fwd", fc_name="mha_varlen_fwd")
def mha_varlen_fwd(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    cu_seqlens_q: Tensor,
    cu_seqlens_k: Tensor,
    max_seqlen_q: int,
    max_seqlen_k: int,
    dropout_p: float,
    softmax_scale: float,
    logits_soft_cap: float,
    zero_tensors: bool,
    is_causal: bool,
    window_size_left: int,
    window_size_right: int,
    return_softmax_lse: bool,
    return_dropout_randval: bool,
    out: Optional[Tensor] = None,
    block_table: Optional[Tensor] = None,
    bias: Optional[Tensor] = None,
    alibi_slopes: Optional[Tensor] = None,
    gen: Optional[Generator] = None,
) -> list[Tensor]: ...


# Binding stub: signature only, body is `...`; no return annotation.
# dq / dk / dv / dbias are optional tensors supplied by the caller.
@compile_ops("module_mha_bwd", fc_name="mha_bwd")
def mha_bwd(
    dout: Tensor,
    q: Tensor,
    k: Tensor,
    v: Tensor,
    out: Tensor,
    softmax_lse: Tensor,
    dropout_p: float,
    softmax_scale: float,
    is_causal: bool,
    window_size_left: int,
    window_size_right: int,
    deterministic: bool,
    dq: Optional[Tensor] = None,
    dk: Optional[Tensor] = None,
    dv: Optional[Tensor] = None,
    dbias: Optional[Tensor] = None,
    bias: Optional[Tensor] = None,
    alibi_slopes: Optional[Tensor] = None,
    rng_state: Optional[Tensor] = None,
    gen: Optional[Generator] = None,
): ...


# Binding stub: signature only, body is `...`; no return annotation.
# Differs from `mha_bwd` by the two required v3 switches
# (`is_v3_atomic_fp32`, `how_v3_bf16_cvt`) and by having no bias / dbias
# parameters.
@compile_ops("module_fmha_v3_bwd", fc_name="fmha_v3_bwd")
def fmha_v3_bwd(
    dout: Tensor,
    q: Tensor,
    k: Tensor,
    v: Tensor,
    out: Tensor,
    softmax_lse: Tensor,
    dropout_p: float,
    softmax_scale: float,
    is_causal: bool,
    window_size_left: int,
    window_size_right: int,
    deterministic: bool,
    is_v3_atomic_fp32: bool,
    how_v3_bf16_cvt: int,
    dq: Optional[Tensor] = None,
    dk: Optional[Tensor] = None,
    dv: Optional[Tensor] = None,
    alibi_slopes: Optional[Tensor] = None,
    rng_state: Optional[Tensor] = None,
    gen: Optional[Generator] = None,
): ...


# Binding stub: signature only, body is `...`; no return annotation.
# This is the only stub in this group that declares `custom_build_args` as an
# explicit parameter. NOTE(review): presumably it is consumed by the
# `compile_ops` wrapper rather than the kernel -- confirm in ..jit.core.
@compile_ops("module_mha_varlen_bwd", fc_name="mha_varlen_bwd")
def mha_varlen_bwd(
    dout: Tensor,
    q: Tensor,
    k: Tensor,
    v: Tensor,
    out: Tensor,
    softmax_lse: Tensor,
    cu_seqlens_q: Tensor,
    cu_seqlens_k: Tensor,
    max_seqlen_q: int,
    max_seqlen_k: int,
    dropout_p: float,
    softmax_scale: float,
    zero_tensors: bool,
    is_causal: bool,
    window_size_left: int,
    window_size_right: int,
    deterministic: bool,
    dq: Optional[Tensor] = None,
    dk: Optional[Tensor] = None,
    dv: Optional[Tensor] = None,
    alibi_slopes: Optional[Tensor] = None,
    rng_state: Optional[Tensor] = None,
    gen: Optional[Generator] = None,
    custom_build_args: Optional[dict] = None,
): ...


# Binding stub: signature only, body is `...`; no return annotation.
# Same parameters as `mha_varlen_bwd` plus the two required v3 switches
# (`is_v3_atomic_fp32`, `how_v3_bf16_cvt`) and without `custom_build_args`.
@compile_ops("module_fmha_v3_varlen_bwd", fc_name="fmha_v3_varlen_bwd")
def fmha_v3_varlen_bwd(
    dout: Tensor,
    q: Tensor,
    k: Tensor,
    v: Tensor,
    out: Tensor,
    softmax_lse: Tensor,
    cu_seqlens_q: Tensor,
    cu_seqlens_k: Tensor,
    max_seqlen_q: int,
    max_seqlen_k: int,
    dropout_p: float,
    softmax_scale: float,
    zero_tensors: bool,
    is_causal: bool,
    window_size_left: int,
    window_size_right: int,
    deterministic: bool,
    is_v3_atomic_fp32: bool,
    how_v3_bf16_cvt: int,
    dq: Optional[Tensor] = None,
    dk: Optional[Tensor] = None,
    dv: Optional[Tensor] = None,
    alibi_slopes: Optional[Tensor] = None,
    rng_state: Optional[Tensor] = None,
    gen: Optional[Generator] = None,
): ...
+ + +def maybe_contiguous(x): + return x.contiguous() if x is not None and x.stride(-1) != 1 else x + + +def _flash_attn_forward( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + dropout_p: float, + softmax_scale: float, + causal: bool, + window_size_left: int, + window_size_right: int, + bias: Optional[torch.Tensor], + alibi_slopes: Optional[torch.Tensor], + return_lse: bool, + return_softmax: bool, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + (_, seqlen_q, _, _) = q.shape + # causal=true is the same as causal=false in this case + if seqlen_q == 1 and alibi_slopes is None: + causal = False + + md_name = "mha_fwd" + filter = "*" + if q.dtype == dtypes.fp16: + md_name += "_fp16" + filter += "fp16*" + elif q.dtype == dtypes.bf16: + md_name += "_bf16" + filter += "bf16*" + if bias is not None: + md_name += "_bias" + filter += "_bias*" + elif alibi_slopes is not None: + md_name += "_alibi" + filter += "_alibi*" + else: + md_name += "_nbias" + filter += "_nbias*" + if not causal and window_size_left == -1 and window_size_right == -1: + md_name += "_nmask" + filter += "_nmask*" + else: + md_name += "_mask" + filter += "_mask*" + if return_lse: + md_name += "_lse" + filter += "_lse*" + else: + md_name += "_nlse" + filter += "_nlse*" + if dropout_p == 0: + md_name += "_ndropout" + filter += "_ndropout*" + else: + md_name += "_dropout" + filter += "_dropout*" + + blob_gen_cmd = [ + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd " + "--receipt 100 --filter {} --output_dir {{}}".format(filter), + f"{AITER_CSRC_DIR}/cpp_itfs/mha_fwd_generate.py --receipt 2 --output_dir {{}}", + ] + + (_, seqlen_q, nhead_q, hdim_q) = q.shape + (_, seqlen_k, nhead_k, hdim_v) = v.shape + + # mask + window_size_left = -1 if window_size_left >= seqlen_k else window_size_left + window_size_right = -1 if window_size_right >= seqlen_k else window_size_right + mask = causal and window_size_left == -1 # causal mask + nmask = not causal and window_size_left == 
-1 and window_size_right == -1 # no mask + + def can_impl_fmha_v3_fwd(): + # basic + ret = alibi_slopes is None + ret &= bias is None + ret &= dropout_p == 0.0 + ret &= seqlen_q == seqlen_k + ret &= seqlen_q % 256 == 0 + ret &= hdim_q == hdim_v + ret &= hdim_q == 128 + ret &= nhead_q % nhead_k == 0 + ret &= mask or nmask + ret &= return_lse + ret &= "gfx946" in torch.cuda.get_device_properties("cuda").gcnArchName + return ret + + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + if can_impl_fmha_v3_fwd(): + out, softmax_lse, S_dmask, rng_state = fmha_v3_fwd( + q, + k, + v, + dropout_p, + softmax_scale, + causal, + window_size_left, + window_size_right, + return_lse, + return_softmax, + None, + bias, + alibi_slopes, + None, + ) + else: + out, softmax_lse, S_dmask, rng_state = mha_fwd( + q, + k, + v, + dropout_p, + softmax_scale, + causal, + window_size_left, + window_size_right, + return_lse, + return_softmax, + None, + bias, + alibi_slopes, + None, + custom_build_args={"md_name": md_name, "blob_gen_cmd": blob_gen_cmd}, + ) + return out, softmax_lse, S_dmask, rng_state + + +def _flash_attn_backward( + dout: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + out: torch.Tensor, + softmax_lse: torch.Tensor, + dq: Optional[torch.Tensor], + dk: Optional[torch.Tensor], + dv: Optional[torch.Tensor], + dbias: Optional[torch.Tensor], + dropout_p: float, + softmax_scale: float, + causal: bool, + window_size_left: int, + window_size_right: int, + bias: Optional[torch.Tensor], + alibi_slopes: Optional[torch.Tensor], + deterministic: bool, + rng_state: Optional[torch.Tensor] = None, + is_v3_atomic_fp32: Optional[bool] = True, + how_v3_bf16_cvt: Optional[int] = 1, +) -> torch.Tensor: + md_name = "mha_bwd" + filter1 = "*" # get_bwd_dot_do_o_blobs() + filter2 = "*" # get_bwd_convert_dq_blobs() + filter3 = "*" # get_bwd_dq_dk_dv_blobs() + if q.dtype == dtypes.fp16: + md_name += "_fp16" + filter1 += "fp16*" + filter2 += "fp16*" + filter3 += "fp16*" + elif 
q.dtype == dtypes.bf16: + md_name += "_bf16" + filter1 += "bf16*" + filter2 += "bf16*" + filter3 += "bf16*" + if bias is not None: + md_name += "_bias" + filter3 += "_bias*" + elif alibi_slopes is not None: + md_name += "_alibi" + filter3 += "_alibi*" + else: + md_name += "_nbias" + filter3 += "_nbias*" + if dbias is not None: + md_name += "_dbias" + filter3 += "_dbias*" + else: + md_name += "_ndbias" + filter3 += "_ndbias*" + if not causal and window_size_left == -1 and window_size_right == -1: + md_name += "_nmask" + filter3 += "_nmask*" + else: + md_name += "_mask" + filter3 += "_mask*" + if dropout_p == 0: + md_name += "_ndropout" + filter3 += "_ndropout*" + else: + md_name += "_dropout" + filter3 += "_dropout*" + if deterministic: + md_name += "_deterministic" + filter2 += "_deterministic*" + filter3 += "_deterministic*" + else: + md_name += "_ndeterministic" + filter2 += "_ndeterministic*" + filter3 += "_ndeterministic*" + + filter = f"{filter1}@{filter2}@{filter3}" + + blob_gen_cmd = [ + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d bwd " + "--receipt 300 --filter {} --output_dir {{}}".format(filter), + f"{AITER_CSRC_DIR}/cpp_itfs/mha_bwd_generate.py --receipt 1 --output_dir {{}}", + ] + + (_, seqlen_q, nhead_q, hdim_q) = q.shape + (_, seqlen_k, nhead_k, hdim_v) = v.shape + + batch_stride_q = q.stride(0) + stride_q = q.stride(1) + nhead_stride_q = q.stride(2) + + batch_stride_k = k.stride(0) + stride_k = k.stride(1) + nhead_stride_k = k.stride(2) + + batch_stride_v = v.stride(0) + stride_v = v.stride(1) + nhead_stride_v = v.stride(2) + + batch_stride_do = dout.stride(0) + stride_do = dout.stride(1) + nhead_stride_do = dout.stride(2) + + batch_stride_dk = dk.stride(0) + nhead_stride_dk = dk.stride(2) + + batch_stride_dv = dv.stride(0) + nhead_stride_dv = dv.stride(2) + + # mask + window_size_left = -1 if window_size_left >= seqlen_k else window_size_left + window_size_right = -1 if window_size_right >= seqlen_k else window_size_right + mask = causal and 
window_size_left == -1 # causal mask + nmask = not causal and window_size_left == -1 and window_size_right == -1 # no mask + swa = not causal and (window_size_left > 0 or window_size_right > 0) + + def np(): + # bwd_hd128_bf16_a16_rtne + # bwd_hd128_bf16_a16_rtna + # bwd_hd128_bf16_a16_rtz + # bwd_hd128_bf16_a32_rtne + # bwd_hd128_bf16_a32_rtna + # bwd_hd128_bf16_a32_rtz + # bwd_hd128_bf16_causal_a16_rtne + # bwd_hd128_bf16_causal_a16_rtna + # bwd_hd128_bf16_causal_a16_rtz + # bwd_hd128_bf16_causal_a32_rtne + # bwd_hd128_bf16_causal_a32_rtna + # bwd_hd128_bf16_causal_a32_rtz + # bwd_hd128_fp16_a16 + # bwd_hd128_fp16_a32 + # bwd_hd128_fp16_causal_a16 + # bwd_hd128_fp16_causal_a32 + # bwd_hd64_bf16_a16_rtne + # bwd_hd64_bf16_a16_rtna + # bwd_hd64_bf16_a16_rtz + # bwd_hd64_bf16_causal_a16_rtne + # bwd_hd64_bf16_causal_a16_rtna + # bwd_hd64_bf16_causal_a16_rtz + # bwd_hd64_fp16_a16 + # bwd_hd64_fp16_causal_a16 + npssk = seqlen_q == seqlen_k + npssk &= seqlen_k % 64 == 0 + npssk &= stride_q == stride_do + npssk &= nhead_stride_q == nhead_stride_do + npssk &= batch_stride_q == batch_stride_do + npssk &= stride_k == stride_v + npssk &= nhead_stride_k == nhead_stride_v + npssk &= batch_stride_k == batch_stride_v + npssk &= nhead_stride_k == nhead_stride_dk + npssk &= nhead_stride_v == nhead_stride_dv + npssk &= (batch_stride_dk / batch_stride_k) == (nhead_q / nhead_k) + npssk &= (batch_stride_dv / batch_stride_v) == (nhead_q / nhead_k) + + hd128_case = (hdim_q == 128) and npssk + + hd64_case = (hdim_q == 64 and is_v3_atomic_fp32 == False) and npssk + + ret = hd128_case or hd64_case + + return ret + + def pssk(): + # only for hd64 a32 causal/no causal, fp16/bf16-rtne/rtna/rtz cases + # FIXME: Currently we only support mask_type == mask_enum::no_mask or causal mask with seqlen_q == seqlen_k + # Because python side only support mask_enum::bottom_right + # However v3 kernel only support mask_enum::top_left + # bwd_hd64_bf16_a32_rtne_pssk + # bwd_hd64_bf16_a32_rtna_pssk + # 
bwd_hd64_bf16_a32_rtz_pssk + # bwd_hd64_bf16_causal_a32_rtne_pssk + # bwd_hd64_bf16_causal_a32_rtna_pssk + # bwd_hd64_bf16_causal_a32_rtz_pssk + # bwd_hd64_fp16_a32_pssk + # bwd_hd64_fp16_causal_a32_pssk + ret = ( + is_v3_atomic_fp32 == True + ) # nhead_stride_dq_acc >= stride_dq_acc must be guaranteed + ret &= hdim_q == 64 + ret &= nmask or ( + mask and seqlen_q == seqlen_k + ) # TODO: or (seqlen_q != seqlen_k and mask_type == top_left) + + return ret + + def pddv(): + # only for a16 causal/no causal, fp16/bf16-rtne/rtna/rtz cases + # bwd_hd128_bf16_a16_rtne_pddv + # bwd_hd128_bf16_a16_rtna_pddv + # bwd_hd128_bf16_a16_rtz_pddv + # bwd_hd128_bf16_causal_a16_rtne_pddv + # bwd_hd128_bf16_causal_a16_rtna_pddv + # bwd_hd128_bf16_causal_a16_rtz_pddv + # bwd_hd128_fp16_a16_pddv + # bwd_hd128_fp16_causal_a16_pddv + ret = is_v3_atomic_fp32 == False + ret &= hdim_q > 64 and hdim_q < 128 + ret &= seqlen_q == seqlen_k + ret &= seqlen_k % 64 == 0 + ret &= stride_q == stride_do + ret &= nhead_stride_q == nhead_stride_do + ret &= batch_stride_q == batch_stride_do + ret &= stride_k == stride_v + ret &= nhead_stride_k == nhead_stride_v + ret &= batch_stride_k == batch_stride_v + ret &= nhead_stride_k == nhead_stride_dk + ret &= nhead_stride_v == nhead_stride_dv + ret &= (batch_stride_dk / batch_stride_k) == (nhead_q / nhead_k) + ret &= (batch_stride_dv / batch_stride_v) == (nhead_q / nhead_k) + + return ret + + def psskddv(): + # only for a32 causal/no causal, fp16/bf16-rtne/rtna/rtz cases + # bwd_hd128_bf16_a32_rtne_psskddv + # bwd_hd128_bf16_a32_rtna_psskddv + # bwd_hd128_bf16_a32_rtz_psskddv + # bwd_hd128_bf16_causal_a32_rtne_psskddv + # bwd_hd128_bf16_causal_a32_rtna_psskddv + # bwd_hd128_bf16_causal_a32_rtz_psskddv + # bwd_hd128_fp16_a32_psskddv + # bwd_hd128_fp16_causal_a32_psskddv + # bwd_hd192_fp16_a32_psskddv + # bwd_hd192_fp16_causal_a32_psskddv + # bwd_hd192_bf16_a32_rtne_psskddv + # bwd_hd192_bf16_a32_rtna_psskddv + # bwd_hd192_bf16_a32_rtz_psskddv + # 
bwd_hd192_bf16_causal_a32_rtne_psskddv + # bwd_hd192_bf16_causal_a32_rtna_psskddv + # bwd_hd192_bf16_causal_a32_rtz_psskddv + ret = is_v3_atomic_fp32 == True + ret &= hdim_q > 64 and hdim_q <= 192 + ret &= ( + nmask + or (mask and seqlen_q == seqlen_k) + or (swa and hdim_q > 64 and hdim_q <= 128) + ) # TODO: or (seqlen_q != seqlen_k and mask_type == top_left) + + return ret + + def can_impl_fmha_v3_bwd(): + # basic + ret = alibi_slopes is None + ret &= bias is None + ret &= dbias is None + ret &= dropout_p == 0.0 + ret &= deterministic == False + ret &= hdim_q == hdim_v + ret &= nhead_q % nhead_k == 0 + ret &= hdim_q >= 64 and hdim_q <= 192 and hdim_q % 8 == 0 + ret &= mask or nmask or swa + ret &= np() or pssk() or pddv() or psskddv() + return ret + + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + if can_impl_fmha_v3_bwd(): + ( + dq, + dk, + dv, + softmax_d, + ) = fmha_v3_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dropout_p, + softmax_scale, + causal, + window_size_left, + window_size_right, + deterministic, + is_v3_atomic_fp32, + how_v3_bf16_cvt, + dq, + dk, + dv, + alibi_slopes, + rng_state, + None, + ) + else: + ( + dq, + dk, + dv, + softmax_d, + ) = mha_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dropout_p, + softmax_scale, + causal, + window_size_left, + window_size_right, + deterministic, + dq, + dk, + dv, + dbias, + bias, + alibi_slopes, + rng_state, + None, + custom_build_args={"md_name": md_name, "blob_gen_cmd": blob_gen_cmd}, + ) + return softmax_d + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + dropout_p, + softmax_scale, + causal, + window_size, + bias, + alibi_slopes, + deterministic, + return_lse, + return_softmax, + is_grad_enabled, + is_v3_atomic_fp32: Optional[bool] = True, + how_v3_bf16_cvt: Optional[int] = 1, + ): + is_grad = is_grad_enabled and any(x.requires_grad for x in 
[q, k, v]) + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + head_size_q_og = q.size(3) + head_size_v_og = v.size(3) + if head_size_q_og % 8 != 0: + q = torch.nn.functional.pad(q, [0, 8 - head_size_q_og % 8]) + k = torch.nn.functional.pad(k, [0, 8 - head_size_q_og % 8]) + if head_size_v_og % 8 != 0: + v = torch.nn.functional.pad(v, [0, 8 - head_size_v_og % 8]) + out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_forward( + q, + k, + v, + dropout_p, + softmax_scale, + causal=causal, + window_size_left=int(window_size[0]), + window_size_right=int(window_size[1]), + bias=bias, + alibi_slopes=alibi_slopes, + return_lse=return_lse, + return_softmax=return_softmax and dropout_p > 0, + ) + if is_grad: + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state) + ctx.dropout_p = dropout_p + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.bias = bias + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + ctx.head_size_q_og = head_size_q_og + ctx.is_v3_atomic_fp32 = is_v3_atomic_fp32 + ctx.how_v3_bf16_cvt = how_v3_bf16_cvt + out = out_padded[..., :head_size_v_og] + + result = [out] + if return_lse: + result.append(softmax_lse) + if return_softmax: + result.append(S_dmask) + + return result[0] if len(result) == 1 else tuple(result) + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors + dq, dk, dv = torch.zeros_like(q), torch.empty_like(k), torch.empty_like(v) + bias = ctx.bias + dbias = torch.empty_like(bias) if bias is not None else None + head_size_q_og = ctx.head_size_q_og + head_size_v_og = dout.size(3) + dout_padded = dout + if head_size_v_og % 8 != 0: + dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_v_og % 8]) + _flash_attn_backward( + dout_padded, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + dbias, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + int(ctx.window_size[0]), + 
int(ctx.window_size[1]), + ctx.bias, + ctx.alibi_slopes, + ctx.deterministic, + rng_state, + ctx.is_v3_atomic_fp32, + ctx.how_v3_bf16_cvt, + ) + dq = dq[..., :head_size_q_og] # We could have padded the head dimension + dk = dk[..., :head_size_q_og] + dv = dv[..., :head_size_v_og] + return dq, dk, dv, None, None, None, None, dbias, None, None, None, None, None + + +def flash_attn_func( + q, + k, + v, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + bias=None, + alibi_slopes=None, + deterministic=True, + return_lse=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (batch_size, seqlen, nheads, headdim_q) + k: (batch_size, seqlen, nheads_k, headdim_q) + v: (batch_size, seqlen, nheads_k, headdim_v) + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim_q). + causal: bool. 
Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + bias: (seqlen_q, seqlen_k) + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (batch_size, seqlen, nheads, headdim_v). + softmax_lse [optional, if return_lse=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). 
+ """ + return FlashAttnFunc.apply( + q, + k, + v, + dropout_p, + softmax_scale, + causal, + window_size, + bias, + alibi_slopes, + deterministic, + return_lse, + return_attn_probs, + torch.is_grad_enabled(), + ) + + +def _flash_attn_varlen_forward( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + dropout_p: float, + softmax_scale: float, + causal: bool, + logits_soft_cap: float = 0.0, + window_size_left: int = -1, + window_size_right: int = -1, + bias: Optional[torch.Tensor] = None, + alibi_slopes: Optional[torch.Tensor] = None, + return_lse: bool = False, + return_softmax: bool = False, + block_table: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + zero_tensors: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # causal=true is the same as causal=false in this case + if max_seqlen_q == 1 and alibi_slopes is None: + causal = False + + md_name = "mha_varlen_fwd" + if block_table is None: + filter_fwd = "*" # get_fwd_blobs() + if q.dtype == dtypes.fp16: + md_name += "_fp16" + filter_fwd += "fp16*" + elif q.dtype == dtypes.bf16: + md_name += "_bf16" + filter_fwd += "bf16*" + if 0.0 < logits_soft_cap: + md_name += "_logits" + filter_fwd += "_logits*" + else: + md_name += "_nlogits" + filter_fwd += "_nlogits*" + if bias is not None: + md_name += "_bias" + filter_fwd += "_bias*" + elif alibi_slopes is not None: + md_name += "_alibi" + filter_fwd += "_alibi*" + else: + md_name += "_nbias" + filter_fwd += "_nbias*" + if not causal and window_size_left == -1 and window_size_right == -1: + md_name += "_nmask" + filter_fwd += "_nmask*" + else: + md_name += "_mask" + filter_fwd += "_mask*" + if return_lse: + md_name += "_lse" + filter_fwd += "_lse*" + else: + md_name += "_nlse" + filter_fwd += "_nlse*" + if dropout_p == 0: + md_name += "_ndropout" + filter_fwd += "_ndropout*" + else: + md_name += "_dropout" + 
filter_fwd += "_dropout*" + blob_gen_cmd = [ + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd " + "--receipt 200 --filter {} --output_dir {{}}".format(filter_fwd) + ] + blob_gen_cmd.append( + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd_splitkv " + "--receipt 200 --filter {} --output_dir {{}}".format('" @ "') + ) + blob_gen_cmd.append( + f"{AITER_CSRC_DIR}/cpp_itfs/mha_fwd_generate.py --receipt 3 --output_dir {{}}" + ) + else: + filter_fwd_splitkv1 = "*" # get_fwd_splitkv_combine_blobs() + filter_fwd_splitkv2 = "*" # get_fwd_splitkv_blobs() + if q.dtype == dtypes.fp16: + md_name += "_fp16" + filter_fwd_splitkv1 += "fp16*" + filter_fwd_splitkv2 += "fp16*" + elif q.dtype == dtypes.bf16: + md_name += "_bf16" + filter_fwd_splitkv1 += "bf16*" + filter_fwd_splitkv2 += "bf16*" + if 0.0 < logits_soft_cap: + md_name += "_logits" + filter_fwd += "_logits*" + else: + md_name += "_nlogits" + filter_fwd += "_nlogits*" + if bias is not None: + md_name += "_bias" + filter_fwd_splitkv2 += "_bias*" + elif alibi_slopes is not None: + md_name += "_alibi" + filter_fwd_splitkv2 += "_alibi*" + else: + md_name += "_nbias" + filter_fwd_splitkv2 += "_nbias*" + if not causal and window_size_left == -1 and window_size_right == -1: + md_name += "_nmask" + filter_fwd_splitkv2 += "_nmask*" + else: + md_name += "_mask" + filter_fwd_splitkv2 += "_mask*" + if return_lse: + md_name += "_lse" + filter_fwd_splitkv1 += "_lse*" + filter_fwd_splitkv2 += "_lse*" + else: + md_name += "_nlse" + filter_fwd_splitkv1 += "_nlse*" + filter_fwd_splitkv2 += "_nlse*" + md_name += "_pagedkv" + filter_fwd_splitkv2 += "_pagedkv*" + filter_fwd_splitkv = f"{filter_fwd_splitkv1}@{filter_fwd_splitkv2}" + blob_gen_cmd = [ + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd " + "--receipt 200 --filter {} --output_dir {{}}".format('" "') + ] + blob_gen_cmd.append( + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd_splitkv " + "--receipt 200 --filter {} --output_dir {{}}".format(filter_fwd_splitkv) + 
) + blob_gen_cmd.append( + f"{AITER_CSRC_DIR}/cpp_itfs/mha_fwd_generate.py --receipt 3 --output_dir {{}}" + ) + + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, softmax_lse, S_dmask, rng_state = mha_varlen_fwd( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + logits_soft_cap, + zero_tensors, + causal, + window_size_left, + window_size_right, + return_lse, + return_softmax, + out, + block_table, + bias, + alibi_slopes, + None, + custom_build_args={"md_name": md_name, "blob_gen_cmd": blob_gen_cmd}, + ) + return out, softmax_lse, S_dmask, rng_state + + +def _flash_attn_varlen_backward( + dout: torch.Tensor, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + out: torch.Tensor, + softmax_lse: torch.Tensor, + dq: Optional[torch.Tensor], + dk: Optional[torch.Tensor], + dv: Optional[torch.Tensor], + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + dropout_p: float, + softmax_scale: float, + causal: bool, + window_size_left: int, + window_size_right: int, + alibi_slopes: Optional[torch.Tensor], + deterministic: bool, + rng_state: Optional[torch.Tensor] = None, + is_v3_atomic_fp32: Optional[bool] = True, + how_v3_bf16_cvt: Optional[int] = 1, + zero_tensors: bool = False, +) -> torch.Tensor: + md_name = "mha_varlen_bwd" + filter1 = "*" # get_bwd_dot_do_o_blobs() + filter2 = "*" # get_bwd_convert_dq_blobs() + filter3 = "*" # get_bwd_dq_dk_dv_blobs() + if q.dtype == dtypes.fp16: + md_name += "_fp16" + filter1 += "fp16*" + filter2 += "fp16*" + filter3 += "fp16*" + elif q.dtype == dtypes.bf16: + md_name += "_bf16" + filter1 += "bf16*" + filter2 += "bf16*" + filter3 += "bf16*" + if alibi_slopes is None: + md_name += "_nbias" + filter3 += "_nbias*" + else: + md_name += "_alibi" + filter3 += "_alibi*" + if not causal and window_size_left == -1 and window_size_right == -1: + md_name += "_nmask" + filter3 += "_nmask*" + else: + md_name += "_mask" + 
filter3 += "_mask*" + if dropout_p == 0: + md_name += "_ndropout" + filter3 += "_ndropout*" + else: + md_name += "_dropout" + filter3 += "_dropout*" + if deterministic: + md_name += "_deterministic" + filter2 += "_deterministic*" + filter3 += "_deterministic*" + else: + md_name += "_ndeterministic" + filter2 += "_ndeterministic*" + filter3 += "_ndeterministic*" + filter = f"{filter1}@{filter2}@{filter3}" + + blob_gen_cmd = [ + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d bwd " + "--receipt 400 --filter {} --output_dir {{}}".format(filter), + f"{AITER_CSRC_DIR}/cpp_itfs/mha_bwd_generate.py --receipt 1 --output_dir {{}}", + ] + + (_, nhead_q, hdim_q) = q.shape + + nhead_k = v.shape[-2] + hdim_v = v.shape[-1] + + # mask + window_size_left = -1 if window_size_left >= max_seqlen_k else window_size_left + window_size_right = -1 if window_size_right >= max_seqlen_k else window_size_right + mask = causal == True and window_size_left == -1 # causal mask + nmask = ( + causal == False and window_size_left == -1 and window_size_right == -1 + ) # no mask + + def pssk(): + # only for hd64 a32 causal/no causal, fp16/bf16-rtne/rtna/rtz cases + # FIXME: Currently we only support mask_type == mask_enum::no_mask + # Because python side only support mask_enum::bottom_right + # However v3 kernel only support mask_enum::top_left + # bwd_hd64_bf16_a32_rtne_pssk_group + # bwd_hd64_bf16_a32_rtna_pssk_group + # bwd_hd64_bf16_a32_rtz_pssk_group + # bwd_hd64_bf16_causal_a32_rtne_pssk_group + # bwd_hd64_bf16_causal_a32_rtna_pssk_group + # bwd_hd64_bf16_causal_a32_rtz_pssk_group + # bwd_hd64_fp16_a32_pssk_group + # bwd_hd64_fp16_causal_a32_pssk_group + # bwd_hd128_bf16_a32_rtne_pssk_group + # bwd_hd128_bf16_a32_rtna_pssk_group + # bwd_hd128_bf16_a32_rtz_pssk_group + # bwd_hd128_bf16_causal_a32_rtne_pssk_group + # bwd_hd128_bf16_causal_a32_rtna_pssk_group + # bwd_hd128_bf16_causal_a32_rtz_pssk_group + # bwd_hd128_fp16_a32_pssk_group + # bwd_hd128_fp16_causal_a32_pssk_group + ret = ( + 
is_v3_atomic_fp32 == True + ) # nhead_stride_dq_acc >= stride_dq_acc must be guaranteed + ret &= hdim_q == 64 or hdim_q == 128 + ret &= nmask # TODO: or (mask and mask_type == mask_enum::mask_top_left) + + return ret + + def psskddv(): + # bwd_hd128_bf16_a32_rtne_psskddv_group + # bwd_hd128_bf16_a32_rtna_psskddv_group + # bwd_hd128_bf16_a32_rtz_psskddv_group + # bwd_hd128_bf16_causal_a32_rtne_psskddv_group + # bwd_hd128_bf16_causal_a32_rtna_psskddv_group + # bwd_hd128_bf16_causal_a32_rtz_psskddv_group + # bwd_hd128_fp16_a32_psskddv_group + # bwd_hd128_fp16_causal_a32_psskddv_group + ret = ( + is_v3_atomic_fp32 == True + ) # nhead_stride_dq_acc >= stride_dq_acc must be guaranteed + ret &= hdim_q > 64 and hdim_q < 128 + ret &= nmask # TODO: or (mask and mask_type == mask_enum::mask_top_left) + + return ret + + def can_impl_fmha_v3_bwd(): + # basic + ret = alibi_slopes is None + # ret &= bias is None + # ret &= dbias is None + ret &= dropout_p == 0.0 + ret &= deterministic == False + ret &= hdim_q == hdim_v + ret &= nhead_q % nhead_k == 0 + ret &= hdim_q >= 64 and hdim_q <= 128 and hdim_q % 8 == 0 + ret &= mask or nmask + ret &= pssk() or psskddv() + + return ret + + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + if can_impl_fmha_v3_bwd(): + ( + dq, + dk, + dv, + softmax_d, + ) = fmha_v3_varlen_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + zero_tensors, + causal, + window_size_left, + window_size_right, + deterministic, + is_v3_atomic_fp32, + how_v3_bf16_cvt, + dq, + dk, + dv, + alibi_slopes, + rng_state, + None, + ) + else: + ( + dq, + dk, + dv, + softmax_d, + ) = mha_varlen_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + zero_tensors, + causal, + window_size_left, + 
window_size_right, + deterministic, + dq, + dk, + dv, + alibi_slopes, + rng_state, + None, + custom_build_args={"md_name": md_name, "blob_gen_cmd": blob_gen_cmd}, + ) + return softmax_d + + +class FlashAttnVarlenFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + logits_soft_cap, + causal, + window_size, + bias, + alibi_slopes, + deterministic, + return_lse, + return_softmax, + block_table, + out, + is_grad_enabled, + is_v3_atomic_fp32: Optional[bool] = True, + how_v3_bf16_cvt: Optional[int] = 1, + ): + is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v]) + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + head_size_q_og = q.size(-1) + head_size_v_og = v.size(-1) + if head_size_q_og % 8 != 0: + q = torch.nn.functional.pad(q, [0, 8 - head_size_q_og % 8]) + k = torch.nn.functional.pad(k, [0, 8 - head_size_q_og % 8]) + if head_size_v_og % 8 != 0: + v = torch.nn.functional.pad(v, [0, 8 - head_size_v_og % 8]) + out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + logits_soft_cap=logits_soft_cap, + window_size_left=window_size[0], + window_size_right=window_size[1], + bias=bias, + alibi_slopes=alibi_slopes, + return_lse=return_lse, + return_softmax=return_softmax and dropout_p > 0, + block_table=block_table, + out=out, + ) + if is_grad: + ctx.save_for_backward( + q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.bias = bias + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + ctx.head_size_q_og = head_size_q_og + 
ctx.is_v3_atomic_fp32 = is_v3_atomic_fp32 + ctx.how_v3_bf16_cvt = how_v3_bf16_cvt + + out = out_padded[..., :head_size_v_og] + + result = [out] + if return_lse: + result.append(softmax_lse) + if return_softmax: + result.append(S_dmask) + + return result[0] if len(result) == 1 else tuple(result) + + @staticmethod + def backward(ctx, dout, *args): + ( + q, + k, + v, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + rng_state, + ) = ctx.saved_tensors + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + bias = ctx.bias + dbias = torch.empty_like(bias) if bias is not None else None + head_size_q_og = ctx.head_size_q_og + head_size_v_og = dout.size(2) + dout_padded = dout + if head_size_v_og % 8 != 0: + dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_v_og % 8]) + # TODO - dbias + _flash_attn_varlen_backward( + dout_padded, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size[0], + ctx.window_size[1], + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + is_v3_atomic_fp32=ctx.is_v3_atomic_fp32, + how_v3_bf16_cvt=ctx.how_v3_bf16_cvt, + ) + dq = dq[..., :head_size_q_og] # We could have padded the head dimension + dk = dk[..., :head_size_q_og] + dv = dv[..., :head_size_v_og] + return ( + dq, + dk, + dv, + None, + None, + None, + None, + None, + None, + None, + None, + None, + dbias, + None, + None, + None, + None, + None, + None, + None, + ) + + +def flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p=0.0, + softmax_scale=None, + logits_soft_cap=0.0, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + bias=None, + alibi_slopes=None, + deterministic=False, + return_lse=False, + return_attn_probs=False, + block_table=None, + out=None, +): + """dropout_p should be set to 0.0 during evaluation 
+ Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head + 0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (total_q, nheads, headdim_q), where total_q = total number of query tokens in the batch. + k: (total_k, nheads_k, headdim_q), where total_k = total number of key tokens in the batch. + v: (total_k, nheads_k, headdim_v), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype dtypes.i32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype dtypes.i32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim_q). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. 
+ bias: (seqlen_q, seqlen_k) + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim_v). + softmax_lse [optional, if return_lse=True]: (nheads, total_q_seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). 
+ """ + return FlashAttnVarlenFunc.apply( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + logits_soft_cap, + causal, + window_size, + bias, + alibi_slopes, + deterministic, + return_lse, + return_attn_probs, + block_table, + out, + torch.is_grad_enabled(), + ) + + +@compile_ops("module_mha_batch_prefill", fc_name="mha_batch_prefill") +def mha_batch_prefill( + q: Tensor, + k: Tensor, + v: Tensor, + cu_seqlens_q: Tensor, + kv_indptr: Tensor, + kv_page_indices: Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + dropout_p: float, + softmax_scale: float, + logits_soft_cap: float, + zero_tensors: bool, + is_causal: bool, + window_size_left: int, + window_size_right: int, + return_softmax_lse: bool, + return_dropout_randval: bool, + out: Optional[Tensor] = None, + alibi_slopes: Optional[Tensor] = None, + gen: Optional[Generator] = None, +): ... + + +def _mha_batch_prefill( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens_q: torch.Tensor, + kv_indptr: torch.Tensor, + kv_page_indices: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + dropout_p: float, + softmax_scale: float, + causal: bool, + logits_soft_cap: float = 0.0, + window_size_left: int = -1, + window_size_right: int = -1, + alibi_slopes: Optional[torch.Tensor] = None, + return_lse: bool = False, + return_softmax: bool = False, + zero_tensors: bool = False, + out: torch.Tensor = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # causal=true is the same as causal=false in this case + if max_seqlen_q == 1 and alibi_slopes is None: + causal = False + + md_name = "mha_batch_prefill" + filter_fwd = "*" # get_fwd_blobs() + if q.dtype == torch.float16: + md_name += "_fp16" + filter_fwd += "fp16*" + elif q.dtype == torch.bfloat16: + md_name += "_bf16" + filter_fwd += "bf16*" + if 0.0 < logits_soft_cap: + md_name += "_logits" + filter_fwd += "_logits*" + else: + md_name += "_nlogits" + filter_fwd += 
"_nlogits*" + if alibi_slopes is None: + md_name += "_nbias" + filter_fwd += "_nbias*" + else: + md_name += "_alibi" + filter_fwd += "_alibi*" + if not causal and window_size_left == -1 and window_size_right == -1: + md_name += "_nmask" + filter_fwd += "_nmask*" + else: + md_name += "_mask" + filter_fwd += "_mask*" + if return_lse: + md_name += "_lse" + filter_fwd += "_lse*" + else: + md_name += "_nlse" + filter_fwd += "_nlse*" + if dropout_p == 0: + md_name += "_ndropout" + filter_fwd += "_ndropout*" + else: + md_name += "_dropout" + filter_fwd += "_dropout*" + blob_gen_cmd = [ + f"{CK_DIR}/example/ck_tile/01_fmha/generate.py -d batch_prefill " + "--receipt 200 --filter {} --output_dir {{}}".format(filter_fwd) + ] + blob_gen_cmd.append( + f"{AITER_CSRC_DIR}/cpp_itfs/mha_fwd_generate.py --receipt 4 --output_dir {{}}" + ) + + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, softmax_lse, S_dmask, rng_state = mha_batch_prefill( + q, + k, + v, + cu_seqlens_q, + kv_indptr, + kv_page_indices, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + logits_soft_cap, + zero_tensors, + causal, + window_size_left, + window_size_right, + return_lse, + return_softmax, + out, + alibi_slopes, + None, + custom_build_args={"md_name": md_name, "blob_gen_cmd": blob_gen_cmd}, + ) + return out, softmax_lse, S_dmask, rng_state + + +def mha_batch_prefill_func( + q, + k, + v, + cu_seqlens_q, + kv_indptr, + kv_page_indices, + max_seqlen_q, + max_seqlen_k, + dropout_p=0.0, + softmax_scale=None, + logits_soft_cap=0.0, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + alibi_slopes=None, + deterministic=False, + return_lse=False, + return_attn_probs=False, + out=None, +): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + head_size_q_og = q.size(2) + head_size_v_og = v.size(2) + if head_size_q_og % 8 != 0: + q = torch.nn.functional.pad(q, [0, 8 - head_size_q_og % 8]) + k = torch.nn.functional.pad(k, [0, 8 - head_size_q_og % 8]) + 
if head_size_v_og % 8 != 0: + v = torch.nn.functional.pad(v, [0, 8 - head_size_v_og % 8]) + out_padded, softmax_lse, S_dmask, rng_state = _mha_batch_prefill( + q, + k, + v, + cu_seqlens_q, + kv_indptr, + kv_page_indices, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + logits_soft_cap=logits_soft_cap, + window_size_left=window_size[0], + window_size_right=window_size[1], + alibi_slopes=alibi_slopes, + return_lse=return_lse, + return_softmax=return_attn_probs and dropout_p > 0, + out=out, + ) + out = out_padded[..., :head_size_v_og] + + result = [out] + if return_lse: + result.append(softmax_lse) + if return_attn_probs: + result.append(S_dmask) + + return result[0] if len(result) == 1 else tuple(result) diff --git a/aiter/ops/moe_c_op.py b/aiter/ops/moe_c_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8ce3206c135153944cfdb9fcf9ea47aa22fce4a6 --- /dev/null +++ b/aiter/ops/moe_c_op.py @@ -0,0 +1,383 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +import torch +from torch import Tensor +from typing import Optional,List +from ..jit.core import ( + compile_ops, +) +from .enum import ActivationType, Enum, QuantType + + + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_gemm_marlin_w8a8( + input: torch.Tensor, + b_qweight : torch.Tensor, + output : torch.Tensor, + a_scale: torch.Tensor, + b_scale : torch.Tensor, + topk_weights : Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids : torch.Tensor, + num_tokens_post_pad: torch.Tensor, + top_k : int, + mode :int, + delta: int)-> torch.Tensor: + """ + --------------------------------------------------------------- + # MoE 场景下 8bit 量化的 GEMM 计算(Marlin 优化版) + + ## 关键前置条件 + 必须配合对应的权重 Shuffle 函数使用,否则会导致计算结果完全错误: + - GEMM1 场景:使用 ops.marlin_weights 处理权重 + - GEMM2 场景:使用 ops.marlin_weights_ours 处理权重 + + + + + + --------------------------------------------------------------- + """ + + pass + + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_gemm_marlin_w4a8( + input: torch.Tensor, + b_qweight : torch.Tensor, + output : torch.Tensor, + a_scale: torch.Tensor, + b_scale : torch.Tensor, + topk_weights : Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids : torch.Tensor, + num_tokens_post_pad: torch.Tensor, + top_k : int, + mode :int, + delta: int)-> torch.Tensor: + """ + --------------------------------------------------------------- + # MoE 场景下 8bit 量化的 GEMM 计算(Marlin 优化版) + + ## 关键前置条件 + 必须配合对应的权重 Shuffle 函数使用,否则会导致计算结果完全错误: + - GEMM1 场景:使用 ops.marlin_weights 处理权重 + - GEMM2 场景:使用 ops.marlin_weights_ours 处理权重 + + + + + + --------------------------------------------------------------- + """ + + pass + + + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_gemm_marlin_w8a8_fp8( + input: torch.Tensor, + b_qweight : torch.Tensor, + output : torch.Tensor, + a_scale: torch.Tensor, + b_scale : torch.Tensor, + topk_weights : Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids : torch.Tensor, + num_tokens_post_pad: 
torch.Tensor, + top_k : int, + mode :int, + delta: int)-> torch.Tensor: + """ + --------------------------------------------------------------- + # MoE 场景下 8bit 量化的 GEMM 计算(Marlin 优化版) + + ## 关键前置条件 + 必须配合对应的权重 Shuffle 函数使用,否则会导致计算结果完全错误: + - GEMM1 场景:使用 ops.marlin_weights 处理权重 + - GEMM2 场景:使用 ops.marlin_weights_ours 处理权重 + + + + + + --------------------------------------------------------------- + """ + + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_gemm_marlin_w4a16( + input: torch.Tensor, + b_qweight : torch.Tensor, + output : torch.Tensor, + b_scale: torch.Tensor, + b_zeros : torch.Tensor, + topk_weights : Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids : torch.Tensor, + num_tokens_post_pad: torch.Tensor, + top_k : int, + mode :int, + delta: int)-> torch.Tensor: + """ + --------------------------------------------------------------- + # MoE 场景下 4bit 量化的 GEMM 计算(Marlin 优化版) + + ## 关键前置条件 + 必须配合对应的权重 Shuffle 函数使用,否则会导致计算结果完全错误: + + + --------------------------------------------------------------- + """ + + pass + + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_w8a8_gemm_block_wise( + input: torch.Tensor, + a_scales: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + group_size_n: int, + group_size_k: int, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + kloops: int, + nloops: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_w8a8_gemm_block_wise_kernel2( + input: torch.Tensor, + a_scales: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + 
num_tokens_post_pad: torch.Tensor, + group_size_n: int, + group_size_k: int, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + kloops: int, + nloops: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_w8a8_gemm_block_wise_fp8( + input: torch.Tensor, + a_scales: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + group_size_n: int, + group_size_k: int, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + kloops: int, + nloops: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_w8a8_gemm_block_wise_kernel2_fp8( + input: torch.Tensor, + a_scales: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + group_size_n: int, + group_size_k: int, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + kloops: int, + nloops: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_w8a16_gemm_awq( + input: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_w8a16_gemm_block_wise( + input: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: 
torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + group_size_n: int, + group_size_k: int, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_wna16_gemm_base( + input: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + top_k: int, + BLOCK_SIZE_M: int, + BLOCK_SIZE_N: int, + BLOCK_SIZE_K: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_wna16_gemm( + input: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + kloops: int, + nloops: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_wna16_gemm_2( + input: torch.Tensor, + output: torch.Tensor, + b_qweight: torch.Tensor, + b_scales: torch.Tensor, + b_qzeros: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, + top_k: int, + BLOCK_SIZE_m: int, + BLOCK_SIZE_n: int, + BLOCK_SIZE_k: int, + kloops: int, + nloops: int, + bit: int +) -> torch.Tensor: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_topk_softmax( + topk_weights: torch.Tensor, # 移除 C++ 引用 & + topk_indices: torch.Tensor, # 移除 C++ 引用 & + token_expert_indices: torch.Tensor, # 移除 C++ 引用 & + 
gating_output: torch.Tensor # 移除 C++ 引用 & +) -> None: # 替代 -> None (C++ 中的 void) + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_silu_and_mul( out : torch.Tensor, + input : torch.Tensor) -> None: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_sum( + input: torch.Tensor, # 移除 C++ 引用 & + output: torch.Tensor, # 移除 C++ 引用 & + topk_ids: torch.Tensor +) -> None: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor +) -> None: + pass + +@compile_ops("module_moe_c_kernel") +def moe_c_sgl_moe_align_block_size( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor +) -> None: + pass + + + diff --git a/aiter/ops/moe_op.py b/aiter/ops/moe_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b7dfd84428317ab69a873a6fcf7046927c0b48a6 --- /dev/null +++ b/aiter/ops/moe_op.py @@ -0,0 +1,433 @@ +# SPDX-License-Identifier: MIT +import torch +from torch import Tensor +from typing import Optional,List +from ..jit.core import ( + compile_ops, +) +from .enum import ActivationType, Enum, QuantType + + +@compile_ops("module_moe_utils") +def topk_softmax( + topk_weights: Tensor, + topk_indices: Tensor, + token_expert_indices: Tensor, + gating_output: Tensor, + need_renorm: bool, +) -> None: ... + + +@compile_ops("module_moe_utils") +def moe_sum(input: Tensor, output: Tensor)->None: ... + +@compile_ops("module_moe_sum") +def asm_moe_sum(input: Tensor, output: Tensor, sorted_ids: Tensor)->None: ... + +@compile_ops("module_moe_utils") +def sgl_moe_align_block_size(topk_ids: Tensor, num_experts: int, + block_size: int, sorted_token_ids: Tensor, + experts_ids: Tensor, + num_tokens_post_pad: Tensor) -> None: ... 
@compile_ops("module_moe_utils")
def moe_align_block_size(
    topk_ids: Tensor,
    num_experts: int,
    block_size: int,
    sorted_token_ids: Tensor,
    experts_ids: Tensor,
    num_tokens_post_pad: Tensor,
) -> None:
    """Declare the ``moe_align_block_size`` op of ``module_moe_utils``.

    The Python body is only a placeholder; the ``compile_ops`` decorator is
    expected to supply the real implementation (confirm in ``..jit.core``).

    Annotated to return ``None``.
    NOTE(review): presumably the results are written into
    ``sorted_token_ids``, ``experts_ids`` and ``num_tokens_post_pad`` --
    confirm against the kernel source.
    """
    ...


@compile_ops("module_moe_asm")
def asm_fmoe_stage1(
    out: Tensor,
    input: Tensor,
    gate: Tensor,
    down: Tensor,
    sorted_token_ids: Tensor,
    sorted_weights: Tensor,
    sorted_expert_ids: Tensor,
    num_valid_ids: Tensor,
    top_k: int,
    scale_a: Optional[torch.Tensor] = None,
    scale_b: Optional[torch.Tensor] = None,
    zero_points: Optional[torch.Tensor] = None,
    mode: Optional[int] = 0,
    solidx: Optional[int] = 0,
    block_size: Optional[int] = 16,
    persist_groups: Optional[int] = 0,
) -> None:
    """Declare the ``asm_fmoe_stage1`` op of ``module_moe_asm``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  The signature is parameter-for-parameter identical to
    ``asm_fmoe_stage2``.

    Annotated to return ``None``.
    NOTE(review): ``out`` looks like the destination buffer and
    ``scale_a`` / ``scale_b`` / ``zero_points`` like optional quantization
    parameters -- confirm against the kernel; the meaning of ``mode``,
    ``solidx`` and ``persist_groups`` is not visible from this file.
    """
    ...

@compile_ops("module_moe_asm")
def asm_fmoe_stage2(
    out: Tensor,
    input: Tensor,
    gate: Tensor,
    down: Tensor,
    sorted_token_ids: Tensor,
    sorted_weights: Tensor,
    sorted_expert_ids: Tensor,
    num_valid_ids: Tensor,
    top_k: int,
    scale_a: Optional[torch.Tensor] = None,
    scale_b: Optional[torch.Tensor] = None,
    zero_points: Optional[torch.Tensor] = None,
    mode: Optional[int] = 0,
    solidx: Optional[int] = 0,
    block_size: Optional[int] = 16,
    persist_groups: Optional[int] = 0,
)-> None:
    """Declare the ``asm_fmoe_stage2`` op of ``module_moe_asm``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Takes exactly the same parameters and defaults as
    ``asm_fmoe_stage1``.

    Annotated to return ``None``.
    NOTE(review): presumably the second half of a two-stage fused MoE whose
    result lands in ``out`` -- confirm against the kernel.
    """
    ...

@compile_ops("module_moe_asm")
def asm_fmoe_a8(
    out: Tensor,
    input: Tensor,
    gate: Tensor,
    down: Tensor,
    sorted_token_ids: Tensor,
    sorted_weights: Tensor,
    sorted_expert_ids: Tensor,
    num_valid_ids: Tensor,
    top_k: int,
    scale_a: Optional[torch.Tensor] = None,
    scale_b: Optional[torch.Tensor] = None,
    zero_points: Optional[torch.Tensor] = None,
    mode: Optional[int] = 0,
    solidx: Optional[int] = 0,
    out_type:Optional[int] = 0,
    persist_groups:Optional[int] = 0,
    use_shuffle:Optional[int] = 0,
)-> None:
    """Declare the ``asm_fmoe_a8`` op of ``module_moe_asm``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Shares its leading parameters with
    ``asm_fmoe_stage1`` / ``asm_fmoe_stage2`` but has no ``block_size`` and
    instead ends with ``out_type``, ``persist_groups`` and ``use_shuffle``.

    Annotated to return ``None``.
    NOTE(review): the integer encodings of ``out_type`` and ``use_shuffle``
    are not visible here -- confirm against the kernel.
    """
    ...
@compile_ops("module_moe_asm")
def asm_moe_get_solutions(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    use_int8_w8a16: Optional[bool] = False,
    use_int4_w4a16: Optional[bool] = False,
    use_int8_w8a8: Optional[bool] = False,
    use_int4_w4a8: Optional[bool] = False,
    use_fp8_w8a8: Optional[bool] = False,
    per_channel_quant: Optional[bool] = False,
    w1_zp: Optional[Tensor] = None,
    w2_zp: Optional[Tensor] = None,
    w1_scale: Optional[Tensor] = None,
    w2_scale: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
    block_shape_n: Optional[int] = 0,
    block_shape_k: Optional[int] = 0,
    block_m: Optional[int] = 32,
    expert_mask: Optional[Tensor] = None,
) -> list[str]:
    """Declare the ``asm_moe_get_solutions`` op of ``module_moe_asm``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Annotated to return a list of strings.

    NOTE(review): judging by the name, the strings identify the kernel
    "solutions" available for the described problem -- confirm the exact
    format against the extension.  All ``use_*`` flags default to False and
    all zero-point / scale tensors default to None.
    """
    ...

# Disabled legacy bindings, kept for reference.
# @compile_ops("module_moe_asm")
# def fmoe(
#     out: Tensor,
#     input: Tensor,
#     gate: Tensor,
#     down: Tensor,
#     sorted_token_ids: Tensor,
#     sorted_weights: Tensor,
#     sorted_expert_ids: Tensor,
#     num_valid_ids: Tensor,
#     topk: int,
# ): ...


# @compile_ops("module_moe_asm")
# def fmoe_int8_g1u0(
#     out: Tensor,
#     input: Tensor,
#     gate: Tensor,
#     down: Tensor,
#     sorted_token_ids: Tensor,
#     sorted_weights: Tensor,
#     sorted_expert_ids: Tensor,
#     num_valid_ids: Tensor,
#     topk: int,
#     input_scale: Tensor,
#     fc1_scale: Tensor,
#     fc2_scale: Tensor,
#     fc2_smooth_scale: Tensor,
#     activation: Optional[Enum] = ActivationType.Silu,
# ): ...


# @compile_ops("module_moe_asm")
# def fmoe_g1u1(
#     out: Tensor,
#     input: Tensor,
#     gate: Tensor,
#     down: Tensor,
#     sorted_token_ids: Tensor,
#     sorted_weights: Tensor,
#     sorted_expert_ids: Tensor,
#     num_valid_ids: Tensor,
#     topk: int,
#     input_scale: Tensor,
#     fc1_scale: Tensor,
#     fc2_scale: Tensor,
#     fc2_smooth_scale: Optional[Tensor] = None,
#     activation: Optional[Enum] = ActivationType.Silu,
# ): ...
+ + +# @compile_ops("module_moe_asm") +# def fmoe_g1u1_tkw1( +# out: Tensor, +# input: Tensor, +# gate: Tensor, +# down: Tensor, +# sorted_token_ids: Tensor, +# sorted_weights: Tensor, +# sorted_expert_ids: Tensor, +# num_valid_ids: Tensor, +# topk: int, +# input_scale: Tensor, +# fc1_scale: Tensor, +# fc2_scale: Tensor, +# fc2_smooth_scale: Optional[Tensor] = None, +# activation: Optional[Enum] = ActivationType.Silu, +# ): ... + + +# @compile_ops("module_moe_asm") +# def fmoe_int8_g1u0_a16( +# out: Tensor, +# input: Tensor, # bf16 +# gate: Tensor, +# down: Tensor, +# sorted_token_ids: Tensor, +# sorted_weights: Tensor, +# sorted_expert_ids: Tensor, +# num_valid_ids: Tensor, +# topk: int, +# fc1_scale: Tensor, +# fc2_scale: Tensor, +# fc1_smooth_scale: Tensor, +# fc2_smooth_scale: Tensor, +# ): ... + + +# @compile_ops("module_moe_asm") +# def fmoe_g1u1_a16( +# out: Tensor, +# input: Tensor, # bf16 +# gate: Tensor, +# down: Tensor, +# sorted_token_ids: Tensor, +# sorted_weights: Tensor, +# sorted_expert_ids: Tensor, +# num_valid_ids: Tensor, +# topk: int, +# fc1_scale: Tensor, +# fc2_scale: Tensor, +# fc1_smooth_scale: Tensor, +# fc2_smooth_scale: Tensor, +# ): ... + + +# @compile_ops("module_moe_asm") +# def fmoe_fp8_blockscale_g1u1( +# out: Tensor, +# input: Tensor, +# gate: Tensor, +# down: Tensor, +# sorted_token_ids: Tensor, +# sorted_weights: Tensor, +# sorted_expert_ids: Tensor, +# num_valid_ids: Tensor, +# topk: int, +# input_scale: Tensor, +# fc1_scale: Tensor, +# fc2_scale: Tensor, +# fc_scale_blkn: int = 128, +# fc_scale_blkk: int = 128, +# fc2_smooth_scale: Optional[Tensor] = None, +# activation: ActivationType = ActivationType.Silu, +# ): ... 
# Disabled legacy binding, kept for reference.
# @compile_ops("module_moe_asm")
# def moe_stage1_g1u1(
#     input: torch.Tensor,
#     w1: torch.Tensor,
#     w2: torch.Tensor,
#     sorted_token_ids: torch.Tensor,
#     sorted_expert_ids: torch.Tensor,
#     num_valid_ids: torch.Tensor,
#     out: torch.Tensor,
#     inter_dim: int,
#     kernelName: str,
#     block_m: int,
#     ksplit: int = 0,
#     activation: ActivationType = ActivationType.Silu,
#     quant_type: QuantType = QuantType.No,
#     a1_scale: Optional[torch.Tensor] = None,
#     w1_scale: Optional[torch.Tensor] = None,
#     sorted_weights: Optional[torch.Tensor] = None,
# ) -> None: ...


@compile_ops("module_moe")
def ck_moe(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    use_int8_w8a16: Optional[bool] = False,
    use_int4_w4a16: Optional[bool] = False,
    use_int8_w8a8_block: Optional[bool] = False,
    use_int4_w4a8_block: Optional[bool] = False,
    w1_zp: Optional[Tensor] = None,
    w2_zp: Optional[Tensor] = None,
    w1_scale: Optional[Tensor] = None,
    w2_scale: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
    block_shape_n: Optional[int] = 0,
    block_shape_k: Optional[int] = 0,
    block_m: Optional[int] = 32,
    solution_id: Optional[int] = 0,
    expert_mask: Optional[Tensor] = None,
)-> torch.Tensor:
    """Declare the ``ck_moe`` op of ``module_moe``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Annotated to return a tensor.

    All ``use_*`` flags default to False, all zero-point / scale tensors to
    None, ``block_m`` to 32 and ``solution_id`` to 0.
    NOTE(review): ``solution_id`` presumably selects one of the entries
    reported by ``ck_moe_get_solutions`` -- confirm against the extension.
    """
    ...
@compile_ops("module_moe")
def ck_shuffle_moe(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    use_int8_w8a16: Optional[bool] = False,
    use_int4_w4a16: Optional[bool] = False,
    use_int8_w8a8_block: Optional[bool] = False,
    use_int4_w4a8_block: Optional[bool] = False,
    w1_zp: Optional[Tensor] = None,
    w2_zp: Optional[Tensor] = None,
    w1_scale: Optional[Tensor] = None,
    w2_scale: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
    block_shape_n: Optional[int] = 0,
    block_shape_k: Optional[int] = 0,
    block_m: Optional[int] = 32,
    solution_id: Optional[int] = 0,
    expert_mask: Optional[Tensor] = None,
)-> torch.Tensor:
    """Declare the ``ck_shuffle_moe`` op of ``module_moe``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Annotated to return a tensor.

    NOTE(review): judging by the name this variant expects pre-shuffled
    weights -- confirm the required layout against the extension.
    """
    ...

@compile_ops("module_moe")
def ck_moe_get_solutions(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    use_int8_w8a16: Optional[bool] = False,
    use_int4_w4a16: Optional[bool] = False,
    use_int8_w8a8_block: Optional[bool] = False,
    use_int4_w4a8_block: Optional[bool] = False,
    w1_zp: Optional[Tensor] = None,
    w2_zp: Optional[Tensor] = None,
    w1_scale: Optional[Tensor] = None,
    w2_scale: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
    block_shape_n: Optional[int] = 0,
    block_shape_k: Optional[int] = 0,
    block_m: Optional[int] = 32,
    expert_mask: Optional[Tensor] = None,
) -> list[int]:
    """Declare the ``ck_moe_get_solutions`` op of ``module_moe``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Annotated to return a list of ints.

    Takes the parameters of ``ck_shuffle_moe`` minus ``solution_id``.
    NOTE(review): the returned ints are presumably valid ``solution_id``
    values for the sibling ops -- confirm against the extension.
    """
    ...
@compile_ops("module_moe")
def ck_moe_stage_1(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    sorted_token_ids: Tensor,
    sorted_expert_ids: Tensor,
    tokens_positions_per_expert: Tensor,
    num_valid_ids: Tensor,
    out: Tensor,
    topk: int,
    use_int8_w8a8_block: Optional[bool] = False,
    use_fp8_w8a8_block: Optional[bool] = False,
    w1_scale: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    block_shape_n: Optional[int] = 0,
    block_shape_k: Optional[int] = 0,
    block_m: Optional[int] = 32,
    sorted_weights: Optional[Tensor] = None,
    act_op: Optional[int] = 0,
)->None:
    """Declare the ``ck_moe_stage_1`` op of ``module_moe``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Annotated to return ``None``.

    NOTE(review): ``out`` is presumably the destination buffer, and the
    integer encoding of ``act_op`` is not visible here -- confirm against
    the extension.
    """
    ...


@compile_ops("module_moe")
def ck_moe_stage_2(
    inter_states: Tensor, # the output of stage 1
    w1: Tensor,
    w2: Tensor,
    sorted_token_ids: Tensor,
    sorted_expert_ids: Tensor,
    tokens_positions_per_expert: Tensor,
    num_valid_ids: Tensor,
    out: Tensor,
    topk: int,
    use_int8_w8a8_block: Optional[bool] = False,
    use_fp8_w8a8_block: Optional[bool] = False,
    w2_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
    block_shape_n: Optional[int] = 0,
    block_shape_k: Optional[int] = 0,
    block_m: Optional[int] = 32,
    sorted_weights: Optional[Tensor] = None,
)->None:
    """Declare the ``ck_moe_stage_2`` op of ``module_moe``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Annotated to return ``None``.

    ``inter_states`` is the output of stage 1.  Compared with
    ``ck_moe_stage_1`` this op takes ``w2_scale`` / ``a2_scale`` instead of
    ``w1_scale`` / ``a1_scale`` and has no ``act_op`` parameter.
    NOTE(review): ``out`` is presumably the destination buffer -- confirm.
    """
    ...

@compile_ops("module_moe")
def ck_moe_per_token_quant(
    input: Tensor,
    out_quant: Tensor,
    out_scale: Tensor,
)->None:
    """Declare the ``ck_moe_per_token_quant`` op of ``module_moe``.

    Placeholder body; the implementation is expected to come from
    ``compile_ops``.  Annotated to return ``None``.

    NOTE(review): presumably quantizes ``input`` per token into
    ``out_quant`` and stores the scales in ``out_scale`` -- confirm dtypes
    and shapes against the extension.
    """
    ...

# Disabled legacy binding, kept for reference.
# @compile_ops("module_moe_ck2stages")
# def ck_moe_stage1(
#     hidden_states: Tensor,
#     w1: Tensor,
#     w2: Tensor,
#     sorted_token_ids: Tensor,
#     sorted_expert_ids: Tensor,
#     num_valid_ids: Tensor,
#     out: Tensor,
#     topk: int,
#     w1_scale: Optional[Tensor] = None,
#     a1_scale: Optional[Tensor] = None,
#     block_m: Optional[int] = 32,
#     sorted_weights: Optional[Tensor] = None,
#     act_op: Optional[int] = 0,
# ): ...
+ + +# @compile_ops("module_moe_ck2stages") +# def ck_moe_stage2( +# inter_states: Tensor, +# w1: Tensor, +# w2: Tensor, +# sorted_token_ids: Tensor, +# sorted_expert_ids: Tensor, +# num_valid_ids: Tensor, +# out: Tensor, +# topk: int, +# w2_scale: Optional[Tensor] = None, +# a2_scale: Optional[Tensor] = None, +# block_m: Optional[int] = 32, +# sorted_weights: Optional[Tensor] = None, +# ): ... diff --git a/aiter/ops/moe_sorting.py b/aiter/ops/moe_sorting.py new file mode 100644 index 0000000000000000000000000000000000000000..13e501c17da90fe28710c95fa35bc53398f626f7 --- /dev/null +++ b/aiter/ops/moe_sorting.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: MIT +import torch +from typing import Optional +from ..jit.core import compile_ops + +MD_NAME = "module_moe_sorting" + + +@compile_ops("module_moe_sorting") +def moe_sorting_fwd( + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + sorted_weights: torch.Tensor, + sorted_expert_ids: torch.Tensor, + tokens_positions_per_expert: torch.Tensor, + num_valid_ids: torch.Tensor, + moe_buf: torch.Tensor, + num_experts: int, + unit_size: int, + local_expert_mask: Optional[torch.Tensor] = None, +) ->None: ... 
diff --git a/aiter/ops/norm.py b/aiter/ops/norm.py new file mode 100644 index 0000000000000000000000000000000000000000..17e5faf0756b8737ab69ae600343a8d9c76226a8 --- /dev/null +++ b/aiter/ops/norm.py @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: MIT +import torch +from torch import Tensor +from typing import Optional +from ..jit.core import compile_ops + +MD_NAME = "module_norm" + + +def gen_layer_norm_fake_tensors( + input: Tensor, + # normalized_shape: List[int], + weight: Optional[Tensor] = None, + bias: Optional[Tensor] = None, + eps: float = 1e-5, + x_bias: Optional[Tensor] = None, +) -> Tensor: + return torch.empty_like( + input, + dtype=input.dtype, + device=input.device, + ) + + +@compile_ops( + "module_norm", fc_name="layernorm2d_fwd", gen_fake=gen_layer_norm_fake_tensors +) +def layer_norm( + input: Tensor, + # normalized_shape: List[int], + weight: Optional[Tensor] = None, + bias: Optional[Tensor] = None, + epsilon: float = 1e-5, + x_bias: Optional[Tensor] = None, +) -> Tensor: ... + + +@compile_ops( + "module_norm", fc_name="layernorm2d_fwd", gen_fake=gen_layer_norm_fake_tensors +) +def layernorm2d_fwd( + input: Tensor, + # normalized_shape: List[int], + weight: Optional[Tensor] = None, + bias: Optional[Tensor] = None, + epsilon: float = 1e-5, + x_bias: Optional[Tensor] = None, +) -> Tensor: ... + + +@compile_ops("module_norm") +def layernorm2d_fwd_with_add( + out: Tensor, + input: Tensor, + residual_in: Tensor, + residual_out: Tensor, + weight: Tensor, + bias: Tensor, + epsilon: float, + x_bias: Optional[Tensor] = None, +) -> None: ... + + +@compile_ops("module_norm") +def layernorm2d_fwd_with_smoothquant( + out: Tensor, + input: Tensor, + xscale: Tensor, + yscale: Tensor, + weight: Tensor, + bias: Tensor, + epsilon: float, + x_bias: Optional[Tensor] = None, +) -> None: ... 
+ + +@compile_ops("module_norm") +def layernorm2d_fwd_with_add_smoothquant( + out: Tensor, + input: Tensor, + residual_in: Tensor, + residual_out: Tensor, + xscale: Tensor, + yscale: Tensor, + weight: Tensor, + bias: Tensor, + epsilon: float, + x_bias: Optional[Tensor] = None, +) -> None: ... + + +@compile_ops("module_norm") +def layernorm2d_fwd_with_dynamicquant( + out: Tensor, + input: Tensor, + yscale: Tensor, + weight: Tensor, + bias: Tensor, + epsilon: float, + x_bias: Optional[Tensor] = None, +) -> None: ... + + +@compile_ops("module_norm") +def layernorm2d_fwd_with_add_dynamicquant( + out: Tensor, + input: Tensor, + residual_in: Tensor, + residual_out: Tensor, + yscale: Tensor, + weight: Tensor, + bias: Tensor, + epsilon: float, + x_bias: Optional[Tensor] = None, +) -> None: ... + +# @compile_ops("module_norm") +# def layernorm2d_with_add_asm( +# out: Tensor, +# input: Tensor, +# residual_in: Tensor, +# residual_out: Tensor, +# weight: Tensor, +# bias: Tensor, +# epsilon: float, +# x_bias: Optional[Tensor] = None, +# ): ... +# @compile_ops("module_norm") +# def layernorm2d_with_add_smoothquant_asm( +# out: Tensor, +# input: Tensor, +# residual_in: Tensor, +# residual_out: Tensor, +# xscale: Tensor, +# yscale: Tensor, +# weight: Tensor, +# bias: Tensor, +# epsilon: float, +# x_bias: Optional[Tensor] = None, +# ): ... diff --git a/aiter/ops/pos_encoding.py b/aiter/ops/pos_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..c95e6334a95cfff60177013d808b42575302c71c --- /dev/null +++ b/aiter/ops/pos_encoding.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: MIT + +from torch import Tensor +from ..jit.core import compile_ops + +MD_NAME = "module_pos_encoding" + + +@compile_ops("module_pos_encoding") +def rotary_embedding_fwd( + positions: Tensor, + query: Tensor, + key: Tensor, + head_size: int, + cos_cache: Tensor, + sin_cache: Tensor, + is_neox: bool, + is_nope_first: bool, +) -> None: ... 
+ + +@compile_ops("module_pos_encoding") +def batched_rotary_embedding( + positions: Tensor, + query: Tensor, + key: Tensor, + head_size: int, + cos_cache: Tensor, + sin_cache: Tensor, + is_neox: bool, + is_nope_first: bool, + rot_dim: int, + cos_sin_cache_offsets: Tensor, +) -> None: ... \ No newline at end of file diff --git a/aiter/ops/quant.py b/aiter/ops/quant.py new file mode 100644 index 0000000000000000000000000000000000000000..605152eed01d1edc7e1af93486f917ba0eab4ea4 --- /dev/null +++ b/aiter/ops/quant.py @@ -0,0 +1,415 @@ +# SPDX-License-Identifier: MIT + +import torch +from torch import Tensor +from typing import Optional +from ..jit.core import compile_ops +import torch.nn.functional as F +import functools +from .enum import QuantType, ActivationType +from . import triton +from ..utility import dtypes, fp4_utils + + +@compile_ops("module_smoothquant") +def smoothquant_fwd( + out: Tensor, input: Tensor, x_scale: Tensor, y_scale: Tensor +) -> None: ... + + +@compile_ops("module_smoothquant") +def moe_smoothquant_fwd( + out: Tensor, input: Tensor, x_scale: Tensor, topk_ids: Tensor, y_scale: Tensor +) -> None: ... 
# The following are pure-torch reference implementations.
@functools.lru_cache()
def get_dtype_max(dtype):
    """Return the largest finite value representable by ``dtype``.

    Floating-point dtypes are resolved through ``torch.finfo``; anything
    ``torch.finfo`` rejects with ``TypeError`` (i.e. integer dtypes) falls
    back to ``torch.iinfo``.  Results are memoized per dtype.
    """
    try:
        return torch.finfo(dtype).max
    except TypeError:
        # torch.finfo raises TypeError for non floating-point dtypes.
        return torch.iinfo(dtype).max


def pertoken_quant(
    x,
    scale=None,
    x_scale=None,  # smooth_scale
    scale_dtype=dtypes.fp32,
    quant_dtype=dtypes.i8,
    dtypeMax=None,
):
    """Quantize ``x`` with one scale per slice along the last dimension.

    Args:
        x: tensor to quantize; converted to fp32 before any arithmetic.
        scale: optional precomputed scale.  When given it is used as-is;
            otherwise the scale is ``amax(|x|, dim=-1, keepdim=True) /
            dtypeMax`` with zero scales replaced by 1.
        x_scale: optional smooth-quant factor multiplied into ``x`` first.
        scale_dtype: dtype of the returned scale.
        quant_dtype: dtype of the returned quantized tensor.
        dtypeMax: override for the maximum of ``quant_dtype``.

    Returns:
        ``(y, y_scale)`` -- the quantized tensor and the scale used.
    """
    x = x.to(dtypes.fp32)
    # smooth quant: pre-multiply by the smoothing factor when provided
    hidden_states = x if x_scale is None else x * x_scale

    if dtypeMax is None:
        dtypeMax = get_dtype_max(quant_dtype)

    per_token_scale = scale
    if scale is None:
        # [m, 1]
        per_token_amax, _ = torch.max(
            input=torch.abs(hidden_states), dim=-1, keepdim=True
        )
        per_token_scale = per_token_amax / dtypeMax
        # all-zero rows would otherwise divide by zero below
        per_token_scale[per_token_scale == 0] = 1

    # quant hidden_states
    y = (hidden_states / per_token_scale).to(dtype=quant_dtype)
    y_scale = per_token_scale.to(scale_dtype)
    return y, y_scale


def per_1x32_f4_quant(x, scale=None, quant_dtype=dtypes.fp4x2, shuffle=False):
    """Quantize ``x`` to packed fp4 using one e8m0 scale per 32 elements.

    Args:
        x: input tensor; it is flattened into blocks of 32 elements, so the
            last dimension must be a multiple of 32 for ``view`` to succeed.
        scale: accepted for signature parity with the other quantizers;
            not read by this function.
        quant_dtype: must be ``dtypes.fp4x2``.
        shuffle: when true the scale is passed through
            ``fp4_utils.e8m0_shuffle`` before being returned.

    Returns:
        ``(y, scale)`` -- ``y`` from ``fp4_utils.f32_to_mxfp4`` reshaped to
        the leading dimensions of ``x``, and the scale viewed as
        ``dtypes.fp8_e8m0``.
    """
    assert quant_dtype == dtypes.fp4x2
    block_size = 32
    F4E2M1_MAX = 6.0
    # floor(log2(6.0)) == 2, so the divisor below is 4.0 rather than 6.0
    MAX_POW2 = int(torch.log2(torch.tensor(F4E2M1_MAX, dtype=torch.float32)).item())
    # dtypeMax = F4E2M1_MAX
    dtypeMax = 2.0**MAX_POW2

    shape_original = x.shape
    x = x.view(-1, shape_original[-1])

    m, _ = x.shape
    x = x.view(-1, block_size)
    max_abs = torch.amax(torch.abs(x.float()), 1)
    # max_abs = max_abs.view(torch.int32)
    # max_abs = ((max_abs + 0x200000) & 0xFF800000).view(torch.float32)

    # fp8e8m0fnu_from_fp32_value
    scale_e8m0_biased = fp4_utils.f32_to_e8m0(max_abs / dtypeMax)

    # Float8_e8m0fnu to float
    scale_f32 = fp4_utils.e8m0_to_f32(scale_e8m0_biased)

    y = x.float() / scale_f32.view(-1, 1)
    y = fp4_utils.f32_to_mxfp4(y)
    y = y.view(*shape_original[:-1], -1)
    scale = scale_e8m0_biased.view(m, -1).view(torch.uint8)
    if shuffle:
        scale = fp4_utils.e8m0_shuffle(scale)
    return y, scale.view(dtypes.fp8_e8m0)


def per_tensor_quant(
    x, scale=None, scale_dtype=dtypes.fp32, quant_dtype=dtypes.i8, dtypeMax=None
):
    """Quantize ``x`` with a single scale for the whole tensor.

    When ``scale`` is None it is computed as ``max(|x|) / dtypeMax`` (with
    ``dtypeMax`` defaulting to the maximum of ``quant_dtype``); otherwise the
    supplied tensor is used unchanged.

    Returns:
        ``(y, scale)`` -- ``x / scale`` cast to ``quant_dtype`` and the scale
        viewed as a 1-element tensor of ``scale_dtype``.
    """
    x = x.to(dtypes.fp32)
    if scale is None:
        if dtypeMax is None:
            dtypeMax = get_dtype_max(quant_dtype)
        scale = torch.abs(x).max() / dtypeMax
    y = x / scale

    return y.to(quant_dtype), scale.view(1).to(scale_dtype)


def per_block_quant_wrapper(block_shape=(1, 128)):
    """Build a decorator turning a per-token quantizer into a 1xN block one.

    The produced wrapper requires a 2-D ``x`` whose last dimension is a
    multiple of ``block_shape[1]``; it reshapes ``x`` to rows of that width,
    calls the wrapped quantizer and reshapes the outputs back to
    ``(m, n)`` / ``(m, n // blk_n)``.  Only ``block_shape[0] == 1`` is
    supported.
    """

    def decorator(per_token_quant_func):
        def wrapper(x, scale=None, quant_dtype=dtypes.i8):
            blk_m, blk_n = block_shape
            assert (
                x.shape[-1] % blk_n == 0
            ), f"block size {blk_n} not match {x.shape[-1]}"
            assert blk_m == 1, "only support 1xN block, TODO: support MxN"
            m, n = x.shape
            x = x.view(-1, blk_n)
            y, scale = per_token_quant_func(x, scale=scale, quant_dtype=quant_dtype)
            return y.view(m, n), scale.view(m, n // blk_n)

        return wrapper

    return decorator


@functools.lru_cache()
def get_torch_quant(qType):
    """Return the pure-torch quantizer for ``qType``.

    Unknown types yield a callable that raises ``NotImplementedError`` when
    invoked.  Results are memoized per ``qType``.
    """
    tmp = {
        QuantType.No: lambda *a, **k: (a[0], None),
        QuantType.per_Tensor: per_tensor_quant,
        QuantType.per_Token: pertoken_quant,
        QuantType.per_1x32: per_1x32_f4_quant,
        QuantType.per_1x128: per_block_quant_wrapper((1, 128))(pertoken_quant),
    }

    def raise_NotImplementedError(*a, **k):
        raise NotImplementedError(f"unsupported quant type {qType=}")

    return tmp.get(qType, raise_NotImplementedError)


@functools.lru_cache()
def get_hip_quant(qType):
    """Return the HIP-backed quantizer for ``qType``.

    Unlike the torch / triton lookups, this table is keyed by
    ``qType.value``.  Unknown types yield a callable that raises
    ``NotImplementedError`` when invoked.
    """
    tmp = {
        QuantType.No.value: lambda *a, **k: (a[0], None),
        QuantType.per_Tensor.value: per_tensor_quant_hip,
        QuantType.per_Token.value: per_token_quant_hip,
        QuantType.per_1x32.value: per_1x32_f4_quant_hip,
        QuantType.per_1x128.value: functools.partial(
            per_group_quant_hip, group_size=128
        ),
    }

    def raise_NotImplementedError(*a, **k):
        raise NotImplementedError(f"unsupported quant type {qType=}")

    return tmp.get(qType.value, raise_NotImplementedError)


@functools.lru_cache()
def get_triton_quant(qType):
    """Return the triton-backed quantizer for ``qType``.

    Unknown types yield a callable that raises ``NotImplementedError`` when
    invoked.  Results are memoized per ``qType``.
    """
    tmp = {
        QuantType.No: lambda *a, **k: (a[0], None),
        QuantType.per_Tensor: per_tensor_quant_triton,
        QuantType.per_Token: per_token_quant_triton,
        QuantType.per_1x32: per_1x32_f4_quant_triton,
        QuantType.per_1x128: per_block_quant_wrapper((1, 128))(per_token_quant_triton),
    }

    def raise_NotImplementedError(*a, **k):
        raise NotImplementedError(f"unsupported quant type {qType=}")

    return tmp.get(qType, raise_NotImplementedError)


def per_token_quant_hip(
    x,
    scale=None,
    quant_dtype=dtypes.i8,
    num_rows: Optional[torch.Tensor] = None,
    num_rows_factor=1,
):
    """Dynamic per-token quantization through ``dynamic_per_token_scaled_quant``.

    Args:
        x: tensor to quantize.
        scale: must be None; static (caller-provided) scales are rejected.
        quant_dtype: dtype of the output buffer.
        num_rows, num_rows_factor: forwarded unchanged to the kernel.

    Returns:
        ``(y, scale)`` -- freshly allocated buffers of shape ``x.shape`` and
        ``(*x.shape[:-1], 1)`` (fp32) that were handed to the kernel.

    Raises:
        ValueError: if ``scale`` is not None.
    """
    if scale is not None:
        raise ValueError("unsupported: static per token quant")

    shape = x.shape
    device = x.device
    scale = torch.empty((*shape[:-1], 1), dtype=dtypes.fp32, device=device)
    y = torch.empty(shape, dtype=quant_dtype, device=device)
    dynamic_per_token_scaled_quant(
        y, x, scale, num_rows=num_rows, num_rows_factor=num_rows_factor
    )
    return y, scale


def per_group_quant_hip(
    x,
    scale=None,
    quant_dtype=dtypes.i8,
    group_size=128,
    transpose_scale=False,
    num_rows: Optional[torch.Tensor] = None,
    num_rows_factor=1,
):
    """Dynamic per-group quantization along the last dimension.

    ``x`` is viewed as rows of ``group_size`` elements and handed to
    ``dynamic_per_token_scaled_quant`` so that each group gets its own scale.

    Args:
        x: tensor to quantize.
        scale: must be None; static scales are rejected.
        quant_dtype: dtype of the output buffer.
        group_size: one of 32, 64 or 128.
        transpose_scale: forwarded to the kernel as ``shuffle_scale``.
        num_rows, num_rows_factor: forwarded unchanged to the kernel.

    Returns:
        ``(y, scale)`` with ``y`` of shape ``x.shape`` and ``scale`` of shape
        ``(*x.shape[:-1], x.shape[-1] // group_size)`` (fp32).

    Raises:
        ValueError: if ``scale`` is not None.
        AssertionError: if ``group_size`` is unsupported.
    """
    if scale is not None:
        raise ValueError("unsupported: static per group quant")
    # Validate before dividing by group_size so a bad value reports clearly.
    assert group_size in [
        32,
        64,
        128,
    ], f"unsupported group size {group_size=}, only support [32, 64, 128]"

    shape = x.shape
    device = x.device
    scale = torch.empty(
        (*shape[:-1], shape[-1] // group_size), dtype=dtypes.fp32, device=device
    )
    y = torch.empty(shape, dtype=quant_dtype, device=device)
    dynamic_per_token_scaled_quant(
        y,
        x.view(-1, group_size),
        scale,
        shuffle_scale=transpose_scale,
        num_rows=num_rows,
        num_rows_factor=num_rows_factor,
    )
    return y, scale


def per_1x32_f4_quant_hip(
    x,
    scale=None,
    quant_dtype=dtypes.fp4x2,
    shuffle=False,
    num_rows: Optional[torch.Tensor] = None,
    num_rows_factor=1,
):
    """Dynamic fp4 quantization with one e8m0 scale per 32 elements (HIP).

    Args:
        x: 2-D tensor ``(m, n)`` with even ``n``.
        scale: must be None; static scales are rejected.
        quant_dtype: must be ``dtypes.fp4x2``.
        shuffle: forwarded as ``shuffle_scale``; when true the scale buffer
            is allocated with rows rounded up to a multiple of 256 and
            columns (``n // 32``) rounded up to a multiple of 8.
        num_rows, num_rows_factor: forwarded unchanged to the kernel.

    Returns:
        ``(y, scale)`` with ``y`` of shape ``(m, n // 2)`` and ``scale``
        viewed as ``dtypes.fp8_e8m0``.

    Raises:
        ValueError: if ``scale`` is not None.
    """
    m, n = x.shape
    assert quant_dtype == dtypes.fp4x2
    assert n % 2 == 0
    device = x.device
    if scale is not None:
        raise ValueError("unsupported: static per token quant")

    if shuffle:
        scale_shape = (
            (m + 255) // 256 * 256,
            (n // 32 + 7) // 8 * 8,
        )
    else:
        scale_shape = (m, n // 32)
    scale = (
        torch.empty(scale_shape, dtype=torch.uint8, device=device)
        # .fill_(0x7F)
        .view(dtypes.fp8_e8m0)
    )
    y = torch.empty(m, n // 2, dtype=quant_dtype, device=device)
    dynamic_per_group_scaled_quant_fp4(
        y,
        x,
        scale,
        32,
        shuffle_scale=shuffle,
        num_rows=num_rows,
        num_rows_factor=num_rows_factor,
    )
    return y, scale


def per_tensor_quant_hip(
    x,
    scale=None,
    quant_dtype=dtypes.i8,
    num_rows: Optional[torch.Tensor] = None,
    num_rows_factor=1,
):
    """Per-tensor quantization through the HIP kernels.

    With ``scale=None`` a 1-element fp32 scale buffer is allocated and
    ``dynamic_per_tensor_quant`` is called; otherwise the provided scale is
    passed to ``static_per_tensor_quant``.

    Args:
        x: tensor to quantize.
        scale: optional precomputed scale tensor.
        quant_dtype: ``dtypes.fp8`` or ``dtypes.i8``.
        num_rows: must be None (not supported here).
        num_rows_factor: accepted for signature parity; not read.

    Returns:
        ``(y, scale.view(1))``.

    Raises:
        ValueError: if ``quant_dtype`` is neither fp8 nor i8.
    """
    assert num_rows is None, "num_rows is not supported for per_tensor_quant_hip"
    if quant_dtype not in [dtypes.fp8, dtypes.i8]:
        raise ValueError(f"unsupported: {quant_dtype=}")

    y = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
    if scale is None:
        scale = torch.empty(1, dtype=dtypes.fp32, device=x.device)
        dynamic_per_tensor_quant(y, x, scale)
    else:
        static_per_tensor_quant(y, x, scale)
    return y, scale.view(1)


def per_token_quant_triton(x, scale=None, quant_dtype=dtypes.i8):
    """Dynamic per-token quantization through the triton kernel.

    Returns ``(y, scale)`` buffers of shape ``x.shape`` and
    ``(*x.shape[:-1], 1)``.

    Raises:
        ValueError: if ``scale`` is not None (static scales unsupported).
    """
    if scale is not None:
        raise ValueError("unsupported: static per token quant")

    shape = x.shape
    device = x.device
    y = torch.empty(shape, dtype=quant_dtype, device=device)
    scale = torch.empty((*shape[:-1], 1), dtype=dtypes.fp32, device=device)
    triton.quant.dynamic_per_token_quant_fp8_i8(y, x.view(-1, x.shape[-1]), scale)
    return y, scale


def per_1x32_f4_quant_triton(x, scale=None, quant_dtype=dtypes.fp4x2, shuffle=False):
    """fp4 quantization delegated to ``fp4_utils.dynamic_mxfp4_quant``.

    ``scale`` is accepted for signature parity and not read.  Returns the
    quantized tensor viewed as ``quant_dtype`` together with its scale.
    """
    assert quant_dtype == dtypes.fp4x2
    # y, scale = triton.quant.dynamic_mxfp4_quant(x)
    y, scale = fp4_utils.dynamic_mxfp4_quant(x, shuffle=shuffle)
    return y.view(quant_dtype), scale


def per_tensor_quant_triton(x, scale=None, quant_dtype=dtypes.i8):
    """Per-tensor quantization through the triton kernels.

    With ``scale=None`` a zero-initialized 1-element fp32 scale is allocated
    and the dynamic kernel is used; otherwise the static kernel receives the
    given scale.  Returns ``(y, scale)`` with ``y`` shaped like ``x``.
    """
    y = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
    x = x.view(-1, x.shape[-1])
    if scale is None:
        scale = torch.zeros(1, dtype=dtypes.fp32, device=x.device)
        triton.quant.dynamic_per_tensor_quant_fp8_i8(y, x, scale)
    else:
        triton.quant.static_per_tensor_quant_fp8_i8(y, x, scale)
    return y, scale


@functools.lru_cache()
def get_torch_act(aType):
    """Return the torch activation callable for ``aType``.

    Unknown types yield a callable that raises ``NotImplementedError`` when
    invoked, mirroring ``get_torch_quant``.  (Previously the fallback was the
    ``NotImplementedError`` class itself, so calling it silently returned an
    exception instance instead of raising.)
    """
    tmp = {
        ActivationType.No: lambda *a, **k: a[0],
        ActivationType.Silu: F.silu,
        ActivationType.Gelu: F.gelu,
    }

    def raise_NotImplementedError(*a, **k):
        raise NotImplementedError(f"unsupported activation type {aType=}")

    return tmp.get(aType, raise_NotImplementedError)


@compile_ops("module_quant")
def static_per_tensor_quant(out: Tensor, input: Tensor, scale: Tensor) -> None: ...


@compile_ops("module_quant")
def dynamic_per_tensor_quant(out: Tensor, input: Tensor, scale: Tensor) -> None: ...


@compile_ops("module_quant")
def dynamic_per_token_scaled_quant(
    out: torch.Tensor,
    input: torch.Tensor,
    scales: torch.Tensor,
    scale_ub: Optional[torch.Tensor] = None,
    shuffle_scale: bool = False,
    num_rows: Optional[torch.Tensor] = None,
    num_rows_factor: int = 1,
) -> None: ...


@compile_ops("module_quant")
def dynamic_per_group_scaled_quant_fp4(
    out: Tensor,
    input: Tensor,
    scales: Tensor,
    group_size: Optional[int] = 32,
    shuffle_scale: bool = True,
    num_rows: Optional[Tensor] = None,
    num_rows_factor: int = 1,
) -> None:
    """
    Only support group_size in [32, 64, 128]
    """
    ...
+ + +@compile_ops("module_quant") +def smooth_per_token_scaled_quant( + out: torch.Tensor, + input: torch.Tensor, + scales: torch.Tensor, + smooth_scale: torch.Tensor, + smooth_scale_map: Optional[torch.Tensor] = None, + shuffle_scale: bool = False, + num_rows: Optional[torch.Tensor] = None, + num_rows_factor: int = 1, +) -> None: ... + + +@compile_ops("module_quant") +def partial_transpose( + out: Tensor, + input: Tensor, + num_rows: Tensor, +) -> None: ... diff --git a/aiter/ops/rmsnorm.py b/aiter/ops/rmsnorm.py new file mode 100644 index 0000000000000000000000000000000000000000..ef52844b1e34f472bd54b21c4fd7929d6abf3d19 --- /dev/null +++ b/aiter/ops/rmsnorm.py @@ -0,0 +1,122 @@ +# SPDX-License-Identifier: MIT + +import torch +from torch import Tensor +from ..jit.core import compile_ops +from typing import Optional + +MD_NAME = "module_rmsnorm" + + +@compile_ops("module_rmsnorm") +def rms_norm_cu( + out: Tensor, + input: Tensor, + weight: Tensor, + epsilon: float, +) -> None: + """ + Cuda version of rmsnorm + """ + ... + + +@compile_ops("module_rmsnorm") +def fused_add_rms_norm_cu( + input: Tensor, # input/out + residual_in: Tensor, # residual_in/out + weight: Tensor, + epsilon: float, +) -> None: + """ + Cuda version of rmsnorm fused add + """ + ... + + +def gen_rms_norm_fake_tensor( + input: Tensor, + weight: Tensor, + epsilon: float, +) -> Tensor: + return torch.empty_like(input, dtype=input.dtype, device=input.device) + + +@compile_ops( + "module_rmsnorm", fc_name="rmsnorm2d_fwd", gen_fake=gen_rms_norm_fake_tensor +) +def rms_norm( + input: Tensor, + weight: Tensor, + epsilon: float, +) -> Tensor: + """ + CK version of rmsnorm + """ + ... + + +@compile_ops("module_rmsnorm", gen_fake=gen_rms_norm_fake_tensor) +def rmsnorm2d_fwd( + input: torch.Tensor, + weight: torch.Tensor, + epsilon: float, +) -> Tensor: ... 


# The five stubs below are bound to "module_rmsnorm" and return None, so each
# one presumably writes its results into the tensors named `out`,
# `residual_out` and `yscale` -- TODO confirm against the kernel source.
@compile_ops("module_rmsnorm")
def rmsnorm2d_fwd_with_add(
    out: Tensor,
    input: Tensor,
    residual_in: Tensor,
    residual_out: Tensor,
    weight: Tensor,
    epsilon: float,
) -> None: ...


@compile_ops("module_rmsnorm")
def rmsnorm2d_fwd_with_smoothquant(
    out: Tensor,
    input: Tensor,
    xscale: Tensor,
    yscale: Tensor,
    weight: Tensor,
    epsilon: float,
) -> None: ...


@compile_ops("module_rmsnorm")
def rmsnorm2d_fwd_with_add_smoothquant(
    out: Tensor,
    input: Tensor,
    residual_in: Tensor,
    residual_out: Tensor,
    xscale: Tensor,
    yscale: Tensor,
    weight: Tensor,
    epsilon: float,
    out_before_quant: Optional[Tensor] = None,
) -> None: ...


@compile_ops("module_rmsnorm")
def rmsnorm2d_fwd_with_dynamicquant(
    out: Tensor,
    input: Tensor,
    yscale: Tensor,
    weight: Tensor,
    epsilon: float,
) -> None: ...


@compile_ops("module_rmsnorm")
def rmsnorm2d_fwd_with_add_dynamicquant(
    out: Tensor,
    input: Tensor,
    residual_in: Tensor,
    residual_out: Tensor,
    yscale: Tensor,
    weight: Tensor,
    epsilon: float,
) -> None: ...
diff --git a/aiter/ops/rope.py b/aiter/ops/rope.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a71de9a57d184a6435704091694c375c7052731
--- /dev/null
+++ b/aiter/ops/rope.py
@@ -0,0 +1,1306 @@
# SPDX-License-Identifier: MIT


from torch import Tensor, empty, empty_like, autograd
from typing import Tuple, Union
from ..jit.core import compile_ops


MD_NAME = "module_rope"


# NOTE: every `*_impl` function in this file is a signature-only stub. The
# `...` body is a placeholder and `compile_ops` is expected to supply the
# compiled kernel (TODO confirm in ..jit.core).
@compile_ops("module_rope_general_fwd")
def rope_fwd_impl(
    output: Tensor,
    input: Tensor,
    freqs: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of traditional RoPE (Rotary Position Embedding).
    Input and output should be in "sbhd" format and freqs should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, it should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is freqs.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


@compile_ops("module_rope_general_bwd")
def rope_bwd_impl(
    input_grads: Tensor,
    output_grads: Tensor,
    freqs: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Backward propagation of traditional RoPE (Rotary Position Embedding).
    input_grads and output_grads should be in "sbhd" format and freqs should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, it should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is freqs.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


@compile_ops("module_rope_general_fwd")
def rope_2c_fwd_impl(
    output_x: Tensor,
    output_y: Tensor,
    input_x: Tensor,
    input_y: Tensor,
    freqs: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of traditional RoPE (Rotary Position Embedding) on two channels.
    Input and output should be in "sbhd" format and freqs should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, it should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is freqs.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stub: the `...` body is a placeholder and `compile_ops` is
# expected to supply the compiled kernel (TODO confirm in ..jit.core).
@compile_ops("module_rope_general_bwd")
def rope_2c_bwd_impl(
    input_grads_x: Tensor,
    input_grads_y: Tensor,
    output_grads_x: Tensor,
    output_grads_y: Tensor,
    freqs: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Backward propagation of traditional RoPE (Rotary Position Embedding) on two channels.
    Gradients should be in "sbhd" format and freqs should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, it should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is freqs.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stub, see the note on the previous function.
@compile_ops("module_rope_general_fwd")
def rope_cached_fwd_impl(
    output: Tensor,
    input: Tensor,
    cos: Tensor,
    sin: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with cached cos and sin.
    Input and output should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stub: the `...` body is a placeholder and `compile_ops` is
# expected to supply the compiled kernel (TODO confirm in ..jit.core).
@compile_ops("module_rope_general_bwd")
def rope_cached_bwd_impl(
    input_grads: Tensor,
    output_grads: Tensor,
    cos: Tensor,
    sin: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Backward propagation of RoPE (Rotary Position Embedding) with cached cos and sin.
    Gradients should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stub, see the note on the previous function.
@compile_ops("module_rope_general_fwd")
def rope_cached_2c_fwd_impl(
    output_x: Tensor,
    output_y: Tensor,
    input_x: Tensor,
    input_y: Tensor,
    cos: Tensor,
    sin: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with cached cos and sin on two channels.
    Input and output should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stub: the `...` body is a placeholder and `compile_ops` is
# expected to supply the compiled kernel (TODO confirm in ..jit.core).
@compile_ops("module_rope_general_bwd")
def rope_cached_2c_bwd_impl(
    input_grads_x: Tensor,
    input_grads_y: Tensor,
    output_grads_x: Tensor,
    output_grads_y: Tensor,
    cos: Tensor,
    sin: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Backward propagation of RoPE (Rotary Position Embedding) with cached cos and sin on two channels.
    Gradients should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stub, see the note on the previous function.
@compile_ops("module_rope_pos_fwd")
def rope_cached_positions_fwd_impl(
    output: Tensor,
    input: Tensor,
    cos: Tensor,
    sin: Tensor,
    positions: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with cached cos and sin with positions
    on one channel. This variant takes no offsets argument. positions should be in [s, b].
    Input and output should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stubs: the `...` bodies are placeholders and `compile_ops` is
# expected to supply the compiled kernels (TODO confirm in ..jit.core).
@compile_ops("module_rope_pos_fwd")
def rope_cached_positions_2c_fwd_impl(
    output_x: Tensor,
    output_y: Tensor,
    input_x: Tensor,
    input_y: Tensor,
    cos: Tensor,
    sin: Tensor,
    positions: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with cached cos and sin with positions
    on two channels. This variant takes no offsets argument. positions should be in [s, b].
    Input and output should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


@compile_ops("module_rope_pos_fwd")
def rope_cached_positions_offsets_fwd_impl(
    output: Tensor,
    input: Tensor,
    cos: Tensor,
    sin: Tensor,
    positions: Tensor,
    offsets: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with cached cos and sin with positions and offsets
    on one channel. Both positions and offsets are required here and should be in [s, b].
    Input and output should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


@compile_ops("module_rope_pos_fwd")
def rope_cached_positions_offsets_2c_fwd_impl(
    output_x: Tensor,
    output_y: Tensor,
    input_x: Tensor,
    input_y: Tensor,
    cos: Tensor,
    sin: Tensor,
    positions: Tensor,
    offsets: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with cached cos and sin with positions and offsets
    on two channels. Both positions and offsets are required here and should be in [s, b].
    Input and output should be in "sbhd" format, and cos and sin should be in shape of [s, 1, 1, d // 2]
    if reuse_freqs_front_part is true. Otherwise, they should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


@compile_ops("module_rope_general_fwd")
def rope_thd_fwd_impl(
    output: Tensor,
    input: Tensor,
    cu_seqlens: Tensor,
    freqs: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with input sizes: (t, h, d).
    where t is cumulative sum of sequence lengths.
    Freqs should be in shape of [s, 1, 1, d // 2] if reuse_freqs_front_part is true. Otherwise,
    it should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is freqs.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stubs: the `...` bodies are placeholders and `compile_ops` is
# expected to supply the compiled kernels (TODO confirm in ..jit.core).
@compile_ops("module_rope_general_bwd")
def rope_thd_bwd_impl(
    input_grads: Tensor,
    output_grads: Tensor,
    cu_seqlens: Tensor,
    freqs: Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Backward propagation of RoPE (Rotary Position Embedding) with input sizes: (t, h, d).
    where t is cumulative sum of sequence lengths.
    Freqs should be in shape of [s, 1, 1, d // 2] if reuse_freqs_front_part is true. Otherwise,
    it should be in [s, 1, 1, d].
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim is freqs.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


@compile_ops("module_rope_general_fwd")
def rope_2d_fwd_impl(
    output: Tensor,
    input: Tensor,
    cos_h: Tensor,
    sin_h: Tensor,
    cos_w: Tensor,
    sin_w: Tensor,
    img_height: int,
    img_width: int,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Forward propagation of RoPE (Rotary Position Embedding) with 2D image as input.
    Input and output should be in (b, s, h, d) where s = H * W.
    cos_h and sin_h are in (1, H', 1, h, d // 4) if reuse_freqs_front_part is true. Otherwise,
    it should be in (1, H', 1, h, d // 2) where H' >= H.
    cos_w and sin_w are in (1, 1, W', h, d // 4) if reuse_freqs_front_part is true. Otherwise,
    it should be in (1, 1, W', h, d // 2) where W' >= W.
    NOTE(review): the previous text gave d // 2 for both cos_w/sin_w cases; d // 4 is assumed here
    by symmetry with cos_h/sin_h -- confirm against the kernel.
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim per axis is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...


# Signature-only stub: the `...` body is a placeholder and `compile_ops` is
# expected to supply the compiled kernel (TODO confirm in ..jit.core).
@compile_ops("module_rope_general_bwd")
def rope_2d_bwd_impl(
    input_grads: Tensor,
    output_grads: Tensor,
    cos_h: Tensor,
    sin_h: Tensor,
    cos_w: Tensor,
    sin_w: Tensor,
    img_height: int,
    img_width: int,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
) -> None:
    """
    Backward propagation of RoPE (Rotary Position Embedding) with 2D image as input.
    output_grads and input_grads should be in (b, s, h, d) where s = H * W.
    cos_h and sin_h are in (1, H', 1, h, d // 4) if reuse_freqs_front_part is true. Otherwise,
    it should be in (1, H', 1, h, d // 2) where H' >= H.
    cos_w and sin_w are in (1, 1, W', h, d // 4) if reuse_freqs_front_part is true. Otherwise,
    it should be in (1, 1, W', h, d // 2) where W' >= W.
    NOTE(review): the previous text gave d // 2 for both cos_w/sin_w cases; d // 4 is assumed here
    by symmetry with cos_h/sin_h -- confirm against the kernel.
    rotate_style: 0 - NEOX style which rotates the 2nd half of elements, 1 - GPT-J style which rotates odd part.
    When rotate dim is smaller than d, front part is just copied if nope_first is true, or later part is copied
    if nope_first is false.
    Rotate dim per axis is cos.shape[-1] * (2 if reuse_freqs_front_part else 1).
    """
    ...
+ + +def rope_fwd( + input: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h, d = input.shape + output = ( + empty( + (b, s, h, d), dtype=input.dtype, device=input.device, requires_grad=False + ).transpose(0, 1) + if transpose_output + else empty_like(input, requires_grad=False) + ) + rope_fwd_impl( + output, input, freqs, rotate_style, reuse_freqs_front_part, nope_first + ) + return output + + +def rope_fwd_inplace( + input: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_fwd_impl(input, input, freqs, rotate_style, reuse_freqs_front_part, nope_first) + + +def rope_bwd( + output_grads: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h, d = output_grads.shape + input_grads = ( + empty( + (b, s, h, d), + dtype=output_grads.dtype, + device=output_grads.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(output_grads, requires_grad=False) + ) + rope_bwd_impl( + input_grads, + output_grads, + freqs, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return input_grads + + +def rope_2c_fwd( + input_x: Tensor, + input_y: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h_x, d = input_x.shape + h_y = input_y.shape[2] + output_x = ( + empty( + (b, s, h_x, d), + dtype=input_x.dtype, + device=input_x.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_x, requires_grad=False) + ) + output_y = ( + empty( + (b, s, h_y, d), + dtype=input_y.dtype, + device=input_y.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_y, requires_grad=False) + ) + rope_2c_fwd_impl( 
+ output_x, + output_y, + input_x, + input_y, + freqs, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return output_x, output_y + + +def rope_2c_fwd_inplace( + input_x: Tensor, + input_y: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_2c_fwd_impl( + input_x, + input_y, + input_x, + input_y, + freqs, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + +def rope_2c_bwd( + output_grads_x: Tensor, + output_grads_y: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h_x, d = output_grads_x.shape + h_y = output_grads_y.shape[2] + input_grads_x = ( + empty( + (b, s, h_x, d), + dtype=output_grads_x.dtype, + device=output_grads_x.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(output_grads_x, requires_grad=False) + ) + input_grads_y = ( + empty( + (b, s, h_y, d), + dtype=output_grads_y.dtype, + device=output_grads_y.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(output_grads_y, requires_grad=False) + ) + rope_2c_bwd_impl( + input_grads_x, + input_grads_y, + output_grads_x, + output_grads_y, + freqs, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return input_grads_x, input_grads_y + + +def rope_cached_fwd( + input: Tensor, + cos: Tensor, + sin: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h, d = input.shape + output = ( + empty( + (b, s, h, d), dtype=input.dtype, device=input.device, requires_grad=False + ).transpose(0, 1) + if transpose_output + else empty_like(input, requires_grad=False) + ) + rope_cached_fwd_impl( + output, input, cos, sin, rotate_style, reuse_freqs_front_part, nope_first + ) + return output + + +def rope_cached_fwd_inplace( + input: Tensor, + cos: Tensor, + sin: 
Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_cached_fwd_impl( + input, input, cos, sin, rotate_style, reuse_freqs_front_part, nope_first + ) + + +def rope_cached_bwd( + output_grads: Tensor, + cos: Tensor, + sin: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h, d = output_grads.shape + input_grads = ( + empty( + (b, s, h, d), + dtype=output_grads.dtype, + device=output_grads.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(output_grads, requires_grad=False) + ) + rope_cached_bwd_impl( + input_grads, + output_grads, + cos, + sin, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return input_grads + + +def rope_cached_2c_fwd( + input_x: Tensor, + input_y: Tensor, + cos: Tensor, + sin: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h_x, d = input_x.shape + h_y = input_y.shape[2] + output_x = ( + empty( + (b, s, h_x, d), + dtype=input_x.dtype, + device=input_x.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_x, requires_grad=False) + ) + output_y = ( + empty( + (b, s, h_y, d), + dtype=input_y.dtype, + device=input_y.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_y, requires_grad=False) + ) + rope_cached_2c_fwd_impl( + output_x, + output_y, + input_x, + input_y, + cos, + sin, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return output_x, output_y + + +def rope_cached_2c_fwd_inplace( + input_x: Tensor, + input_y: Tensor, + cos: Tensor, + sin: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_cached_2c_fwd_impl( + input_x, + input_y, + input_x, + input_y, + cos, + sin, + rotate_style, + reuse_freqs_front_part, + 
nope_first, + ) + + +def rope_cached_2c_bwd( + output_grads_x: Tensor, + output_grads_y: Tensor, + cos: Tensor, + sin: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h_x, d = output_grads_x.shape + h_y = output_grads_y.shape[2] + input_grads_x = ( + empty( + (b, s, h_x, d), + dtype=output_grads_x.dtype, + device=output_grads_x.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(output_grads_x, requires_grad=False) + ) + input_grads_y = ( + empty( + (b, s, h_y, d), + dtype=output_grads_y.dtype, + device=output_grads_y.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(output_grads_y, requires_grad=False) + ) + rope_cached_2c_bwd_impl( + input_grads_x, + input_grads_y, + output_grads_x, + output_grads_y, + cos, + sin, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return input_grads_x, input_grads_y + + +def rope_cached_positions_fwd( + input: Tensor, + cos: Tensor, + sin: Tensor, + positions: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h, d = input.shape + output = ( + empty( + (b, s, h, d), dtype=input.dtype, device=input.device, requires_grad=False + ).transpose(0, 1) + if transpose_output + else empty_like(input, requires_grad=False) + ) + rope_cached_positions_fwd_impl( + output, + input, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return output + + +def rope_cached_positions_2c_fwd( + input_x: Tensor, + input_y: Tensor, + cos: Tensor, + sin: Tensor, + positions: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h_x, d = input_x.shape + h_y = input_y.shape[2] + output_x = ( + empty( + (b, s, h_x, d), + dtype=input_x.dtype, + device=input_x.device, + 
requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_x, requires_grad=False) + ) + output_y = ( + empty( + (b, s, h_y, d), + dtype=input_y.dtype, + device=input_y.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_y, requires_grad=False) + ) + rope_cached_positions_2c_fwd_impl( + output_x, + output_y, + input_x, + input_y, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return output_x, output_y + + +def rope_cached_positions_fwd_inplace( + input: Tensor, + cos: Tensor, + sin: Tensor, + positions: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_cached_positions_fwd_impl( + input, + input, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + +def rope_cached_positions_2c_fwd_inplace( + input_x: Tensor, + input_y: Tensor, + cos: Tensor, + sin: Tensor, + positions: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_cached_positions_2c_fwd_impl( + input_x, + input_y, + input_x, + input_y, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + +def rope_cached_positions_offsets_fwd( + input: Tensor, + cos: Tensor, + sin: Tensor, + positions: Tensor, + offsets: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h, d = input.shape + output = ( + empty( + (b, s, h, d), dtype=input.dtype, device=input.device, requires_grad=False + ).transpose(0, 1) + if transpose_output + else empty_like(input, requires_grad=False) + ) + rope_cached_positions_offsets_fwd_impl( + output, + input, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return output + + +def rope_cached_positions_offsets_2c_fwd( + input_x: Tensor, + input_y: Tensor, + cos: Tensor, + sin: 
Tensor, + positions: Tensor, + offsets: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> Tensor: + s, b, h_x, d = input_x.shape + h_y = input_y.shape[2] + output_x = ( + empty( + (b, s, h_x, d), + dtype=input_x.dtype, + device=input_x.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_x, requires_grad=False) + ) + output_y = ( + empty( + (b, s, h_y, d), + dtype=input_y.dtype, + device=input_y.device, + requires_grad=False, + ).transpose(0, 1) + if transpose_output + else empty_like(input_y, requires_grad=False) + ) + rope_cached_positions_offsets_2c_fwd_impl( + output_x, + output_y, + input_x, + input_y, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return output_x, output_y + + +def rope_cached_positions_offsets_fwd_inplace( + input: Tensor, + cos: Tensor, + sin: Tensor, + positions: Tensor, + offsets: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_cached_positions_offsets_fwd_impl( + input, + input, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + +def rope_cached_positions_offsets_2c_fwd_inplace( + input_x: Tensor, + input_y: Tensor, + cos: Tensor, + sin: Tensor, + positions: Tensor, + offsets: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_cached_positions_offsets_2c_fwd_impl( + input_x, + input_y, + input_x, + input_y, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + +def rope_thd_fwd( + input: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + output = empty_like(input, requires_grad=False) + rope_thd_fwd_impl( + output, + input, + cu_seqlens, + freqs, + rotate_style, + reuse_freqs_front_part, + 
nope_first, + ) + return output + + +def rope_thd_fwd_inplace( + input: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_thd_fwd_impl( + input, + input, + cu_seqlens, + freqs, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + +def rope_thd_bwd( + output_grads: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + input_grads = empty_like(output_grads, requires_grad=False) + rope_thd_bwd_impl( + input_grads, + output_grads, + cu_seqlens, + freqs, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return input_grads + + +def rope_2d_fwd( + input: Tensor, + cos_h: Tensor, + sin_h: Tensor, + cos_w: Tensor, + sin_w: Tensor, + img_height: int, + img_width: int, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + output = empty_like(input, requires_grad=False) + rope_2d_fwd_impl( + output, + input, + cos_h, + sin_h, + cos_w, + sin_w, + img_height, + img_width, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return output + + +def rope_2d_fwd_inplace( + input: Tensor, + cos_h: Tensor, + sin_h: Tensor, + cos_w: Tensor, + sin_w: Tensor, + img_height: int, + img_width: int, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + rope_2d_fwd_impl( + input, + input, + cos_h, + sin_h, + cos_w, + sin_w, + img_height, + img_width, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + +def rope_2d_bwd( + output_grads: Tensor, + cos_h: Tensor, + sin_h: Tensor, + cos_w: Tensor, + sin_w: Tensor, + img_height: int, + img_width: int, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, +) -> Tensor: + input_grads = empty_like(output_grads, requires_grad=False) + rope_2d_bwd_impl( + input_grads, + output_grads, + cos_h, + sin_h, + cos_w, + sin_w, + img_height, + img_width, + 
rotate_style, + reuse_freqs_front_part, + nope_first, + ) + return input_grads + + +class RoPE(autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, + ) -> Tensor: + ctx.rotate_style = rotate_style + ctx.reuse_freqs_front_part = reuse_freqs_front_part + ctx.nope_first = nope_first + ctx.transpose_output = transpose_output + ctx.save_for_backward(freqs) + return rope_fwd( + x, freqs, rotate_style, reuse_freqs_front_part, nope_first, transpose_output + ) + + @staticmethod + def backward(ctx, output_grads: Tensor) -> Tuple[Union[Tensor, None], ...]: + (freqs,) = ctx.saved_tensors + return ( + rope_bwd( + output_grads, + freqs, + ctx.rotate_style, + ctx.reuse_freqs_front_part, + ctx.nope_first, + ctx.transpose_output, + ), + None, + None, + ) + + +class RoPECached(autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + cos: Tensor, + sin: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, + ) -> Tensor: + ctx.rotate_style = rotate_style + ctx.reuse_freqs_front_part = reuse_freqs_front_part + ctx.nope_first = nope_first + ctx.transpose_output = transpose_output + ctx.save_for_backward(cos, sin) + return rope_cached_fwd( + x, + cos, + sin, + rotate_style, + reuse_freqs_front_part, + nope_first, + transpose_output, + ) + + @staticmethod + def backward(ctx, output_grads) -> Tuple[Union[Tensor, None], ...]: + cos, sin = ctx.saved_tensors + return ( + rope_cached_bwd( + output_grads, + cos, + sin, + ctx.rotate_style, + ctx.reuse_freqs_front_part, + ctx.nope_first, + ctx.transpose_output, + ), + None, + None, + ) + + +class RoPETHD(autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + ): + ctx.rotate_style = rotate_style + 
ctx.reuse_freqs_front_part = reuse_freqs_front_part + ctx.nope_first = nope_first + ctx.save_for_backward(cu_seqlens, freqs) + return rope_thd_fwd( + x, cu_seqlens, freqs, rotate_style, reuse_freqs_front_part, nope_first + ) + + @staticmethod + def backward(ctx, output_grads) -> Tuple[Union[Tensor, None], ...]: + cu_seqlens, freqs = ctx.saved_tensors + return ( + rope_thd_bwd( + output_grads, + cu_seqlens, + freqs, + ctx.rotate_style, + ctx.reuse_freqs_front_part, + ctx.nope_first, + ), + None, + None, + ) + + +class RoPE2D(autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + cos_height: Tensor, + sin_height: Tensor, + cos_width: Tensor, + sin_width: Tensor, + img_height: int, + img_width: int, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + ) -> Tensor: + ctx.img_height = img_height + ctx.img_width = img_width + ctx.rotate_style = rotate_style + ctx.reuse_freqs_front_part = reuse_freqs_front_part + ctx.nope_first = nope_first + ctx.save_for_backward(cos_height, sin_height, cos_width, sin_width) + return rope_2d_fwd( + x, + cos_height, + sin_height, + cos_width, + sin_width, + img_height, + img_width, + rotate_style, + reuse_freqs_front_part, + nope_first, + ) + + @staticmethod + def backward(ctx, output_grads) -> Tuple[Union[Tensor, None], ...]: + cos_height, sin_height, cos_width, sin_width = ctx.saved_tensors + return ( + rope_2d_bwd( + output_grads, + cos_height, + sin_height, + cos_width, + sin_width, + ctx.img_height, + ctx.img_height, + ctx.rotate_style, + ctx.reuse_freqs_front_part, + ctx.nope_first, + ), + None, + None, + ) diff --git a/aiter/ops/shuffle.py b/aiter/ops/shuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..4a3051a75bedcfed98c2c99e3639350476e172bd --- /dev/null +++ b/aiter/ops/shuffle.py @@ -0,0 +1,342 @@ +# SPDX-License-Identifier: MIT +import torch +import numpy as np + +# Moe_c Shuffle Function 
# Moe_c shuffle functions
# =============================================================================


def _require_multiple(value, divisor, what):
    """Raise ``ValueError`` unless *value* is a whole multiple of *divisor*."""
    if value % divisor != 0:
        raise ValueError(f"{what}={value} must be a multiple of {divisor}")


def moe_layout_shuffle_gemm1(weight):
    """Public entry point for the W8A8 GEMM1 layout; see ``_w8a8_marlin_weight_1``."""
    return _w8a8_marlin_weight_1(weight)


def moe_layout_shuffle_gemm2(weight):
    """Public entry point for the W8A8 GEMM2 layout; see ``_w8a8_marlin_weight_2``."""
    return _w8a8_marlin_weight_2(weight)


def w4a8_moe_layout_shuffle_gemm1(weight):
    """Public entry point for the W4A8 GEMM1 layout; see ``_w4a8_gemm1_weight_shuffle``."""
    return _w4a8_gemm1_weight_shuffle(weight)


def w4a8_moe_layout_shuffle_gemm2(weight):
    """Public entry point for the W4A8 GEMM2 layout; see ``_w4a8_gemm2_weight_shuffle``."""
    return _w4a8_gemm2_weight_shuffle(weight)


# w4a16
def _w4a16_tile_shuffle(packed):
    """Re-tile a packed 3-D weight through a ``uint32`` view.

    ``packed`` has shape ``[e, rows, cols // 2]``.  The buffer is flattened,
    reinterpreted as ``uint32`` and regrouped into
    ``[e, cols // 128, rows // 16, 4, 16, 4]``.

    NOTE(review): the shape arithmetic only balances for a 1-byte element
    type (presumably two 4-bit values per byte) - confirm against callers.

    Raises:
        ValueError: if ``rows`` is not a multiple of 16 or the unpacked
            column count is not a multiple of 128.
    """
    experts, rows, packed_cols = packed.shape
    cols = packed_cols * 2
    _require_multiple(rows, 16, "rows")
    _require_multiple(cols, 128, "unpacked columns")
    words = packed.view(-1).view(torch.uint32)
    # Shape of the uint32 tensor: 16-row groups x 4-word groups.
    words = words.view(experts, rows // 16, 16, cols // 32, 4)
    words = words.transpose(2, 3).contiguous()
    words = words.view(experts, rows // 16, cols // 128, 4, 16, 4)
    return words.transpose(1, 2).contiguous()


def w4a16_marlin_weight_1(weight_input  # [e, size_n, size_k // 2]
                          ):
    """Shuffle the first W4A16 weight; input is ``[e, n, k // 2]``."""
    return _w4a16_tile_shuffle(weight_input)


def w4a16_marlin_weight_2(weight_input  # [e, size_k, size_n // 2]
                          ):
    """Shuffle the second W4A16 weight; input is ``[e, k, n // 2]``.

    The tiling is identical to ``w4a16_marlin_weight_1``; only the meaning
    of the two trailing axes differs.
    """
    return _w4a16_tile_shuffle(weight_input)


# w8a8
def _w8a8_marlin_weight_1(weight_input  # 3-D, last two axes are swapped first
                          ):
    """Swap the last two axes, then apply ``_marlin_weights`` (k_tile=64)."""
    swapped = weight_input.permute(0, 2, 1)
    return _marlin_weights(swapped, k_tile=64, n_tile=16, pack_factor=8)


def _w8a8_marlin_weight_2(weight_input  # 3-D, last two axes are swapped first
                          ):
    """Swap the last two axes, then apply ``_marlin_weights_2`` (64 x 16 tiles)."""
    swapped = weight_input.permute(0, 2, 1)
    return _marlin_weights_2(swapped, k_tile=64, n_tile=16, pack_factor=8)


def _marlin_weights(
        q_w,
        k_tile=64,
        n_tile=16,
        pack_factor=8):
    """Group K into tiles of ``k_tile`` and make each tile column-major.

    Args:
        q_w: tensor of shape ``[e, size_k, size_n]``.
        k_tile: rows per tile along K.
        n_tile: unused; kept for signature compatibility.
        pack_factor: unused; kept for signature compatibility.

    Returns:
        Tensor of shape ``[e, size_k // k_tile, size_n * k_tile]`` where
        ``out[e, kb, n * k_tile + kk] == q_w[e, kb * k_tile + kk, n]``.

    Raises:
        ValueError: if ``size_k`` is not a multiple of ``k_tile``.
    """
    experts, size_k, size_n = q_w.shape
    _require_multiple(size_k, k_tile, "size_k")
    tiles = q_w.reshape(experts, size_k // k_tile, k_tile, size_n)
    tiles = tiles.permute(0, 1, 3, 2).contiguous()
    return tiles.reshape(experts, size_k // k_tile, size_n * k_tile)


def _marlin_weights_2(
        q_w,
        k_tile=64,
        n_tile=16,
        pack_factor=8):
    """Tile ``[e, size_k, size_n]`` into 16 x 16 sub-blocks of (k_tile, n_tile) tiles.

    Args:
        q_w: tensor of shape ``[e, size_k, size_n]``.
        k_tile: tile extent along K; must be a multiple of 16.
        n_tile: tile extent along N; must be a multiple of 16.
        pack_factor: unused; kept for signature compatibility.

    Returns:
        Tensor of shape
        ``[e, size_k // k_tile, size_n // n_tile, n_tile // 16, k_tile // 16, 16, 16]``.

    Raises:
        ValueError: if any extent is not a whole number of tiles.
    """
    experts, size_k, size_n = q_w.shape
    _require_multiple(size_k, k_tile, "size_k")
    _require_multiple(size_n, n_tile, "size_n")
    _require_multiple(k_tile, 16, "k_tile")
    _require_multiple(n_tile, 16, "n_tile")
    k_blocks = size_k // k_tile
    n_blocks = size_n // n_tile
    tiles = q_w.reshape(experts, k_blocks, k_tile, n_blocks, n_tile)
    tiles = tiles.permute((0, 1, 3, 4, 2)).contiguous()
    tiles = tiles.reshape(experts, k_blocks, n_blocks, n_tile // 16, 16, k_tile // 16, 16)
    return tiles.permute(0, 1, 2, 3, 5, 4, 6).contiguous()


# w4a8
def _w4a8_tile_shuffle(w4a8_w):
    """Transpose a 2-D weight and tile it as (32 along K) x (256 along N).

    Returns a tensor of shape ``[size_k // 32, size_n // 256, 8, 4, 32, 8]``
    where ``size_k, size_n`` are the extents of the transposed input.

    Raises:
        ValueError: if the transposed extents are not multiples of 32 / 256.
    """
    k_tile = 32
    n_tile = 256
    transposed = w4a8_w.T
    size_k, size_n = transposed.shape
    _require_multiple(size_k, k_tile, "size_k")
    _require_multiple(size_n, n_tile, "size_n")
    k_blocks = size_k // k_tile
    n_blocks = size_n // n_tile
    tiles = transposed.reshape(k_blocks, k_tile, n_blocks, n_tile)
    tiles = tiles.permute((0, 2, 3, 1)).contiguous()
    tiles = tiles.reshape(k_blocks, n_blocks, n_tile // 32, 32, k_tile // 8, 8)
    return tiles.permute(0, 1, 2, 4, 3, 5).contiguous()


def _w4a8_gemm1_weight_shuffle(w4a8_w):
    """W4A8 GEMM1 layout; shares its tiling with GEMM2."""
    return _w4a8_tile_shuffle(w4a8_w)


def _w4a8_gemm2_weight_shuffle(w4a8_w):
    """W4A8 GEMM2 layout; shares its tiling with GEMM1."""
    return _w4a8_tile_shuffle(w4a8_w)

# ============================ end of Moe_c shuffle functions ============================
Function================================================================ + +def asm_shuffle_weight_b8(x: torch.Tensor, stage: torch.int32 = 1) -> torch.Tensor: + # Hardcode BLOCK_K and BLOCK_N + assert x.dtype in [ + torch.float32, torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn + ] + + if x.dtype == torch.int8 or x.dtype == torch.float8_e4m3fn: + N = 16 + K = 16 + IK = 64 + IN = 64 + BK = 256 + BN = 128 + if stage == 1: + if x.shape[-2] % 128 != 0 and x.shape[-2] % 64 == 0: + BN = 64 + if stage == 2: + if x.shape[-1] % 128 == 0: + BK = 128 + elif x.shape[-1] % 128 == 64: + BN = 64 + BK = 64 + elif x.shape[-1] % 128 == 96: + BN = 64 + BK = 64 + assert x.shape[-2] % BN == 0, f"{x.shape[-2]} % {BN} == {x.shape[-2] % BN }" + x_ = x + multiple = x.shape[-1] // BK * BK + part1 = x[:, :, :multiple] + ### part1 shuffle + # 0, 1, 2, 3, 4, 5, 6, 7, 8 + part1 = part1.view(-1, part1.shape[-2] // BN, BN // IN, IN // N, N, part1.shape[-1] // BK, BK // IK, IK // K, K) + part1 = part1.permute(0, 1, 5, 2, 6, 3, 7, 4, 8).contiguous() + part1 = part1.flatten(start_dim=1) + ### part2 shuffle + part2 = x[:, :, multiple:] + IK = 32 + BK = 32 + # 0, 1, 2, 3, 4, 5, 6, 7, 8 + part2 = part2.view(-1, part2.shape[-2] // BN, BN // IN, IN // N, N, part2.shape[-1] // BK, BK // IK, IK // K, K) + part2 = part2.permute(0, 1, 5, 2, 6, 3, 7, 4, 8).contiguous() + part2 = part2.flatten(start_dim=1) + ### combine + x_ = torch.cat((part1, part2), dim=1) + x_ = x_.view(*x.shape) + return x_ + + elif x.dtype == torch.float16 or x.dtype == torch.bfloat16: + N = 16 + K = 8 + IK = 32 + IN = 64 + BK = 128 + BN = 64 + if stage == 2: + BK = 32 + else: + assert False, f"not support {x.dtype}" + + assert x.shape[-2] % BN == 0, f"{x.shape[-2]} % {BN} == {x.shape[-2] % BN }" + assert x.shape[-1] % BK == 0, f"{x.shape[-1]} % {BK} == {x.shape[-1] % BK }" + + x_ = x + # 0, 1, 2, 3, 4, 5, 6, 7, 8 + x_ = x_.view(-1, x.shape[-2] // BN, BN // IN, IN // N, N, x.shape[-1] // BK, BK // IK, IK // K, K) + x_ = 
x_.permute(0, 1, 5, 2, 6, 3, 7, 4, 8) + x_ = x_.contiguous() + x_ = x_.view(*x.shape) + return x_ + +def shuffle_weight(x: torch.Tensor, layout=(16, 16), use_int4=False) -> torch.Tensor: + # Hardcode BLOCK_K and BLOCK_N + IN, IK = layout + BK = IK * 2 + K = 16 // x.element_size() if not use_int4 else 32 + BN = IN + assert x.shape[-2] % BN == 0, f"{x.shape[-2]} % {BN} == {x.shape[-2] % BN }" + assert x.shape[-1] % BK == 0, f"{x.shape[-1]} % {BK} == {x.shape[-1] % BK }" + + x_ = x + x_ = x_.view(-1, x.shape[-2] // BN, BN, x.shape[-1] // BK, BK // K, K) + x_ = x_.permute(0, 1, 3, 4, 2, 5) + x_ = x_.contiguous() + x_ = x_.view(*x.shape) + return x_ + +# TN Layout in -> CK Tiling Layout out +# layout(NWaves, NRepeat, NLane, NInterleave, NVec, KWaves, KRepeat, KLane, KVec) +def ck_shuffle_weight(x:torch.Tensor, layout=(4, 1, 16, 2, 1, 1, 4, 4, 8)) -> torch.Tensor: + NWaves, NRepeat, NLane, NInterleave, NVec, KWaves, KRepeat, KLane, KVec = layout + Block_N = NWaves * NRepeat * NLane * NInterleave * NVec + Block_K = KWaves * KRepeat * KLane * KVec + assert x.shape[-2] % Block_N == 0, f"{x.shape[-2]} % {Block_N} == {x.shape[-2] % Block_N }" + assert x.shape[-1] % Block_K == 0, f"{x.shape[-1]} % {Block_K} == {x.shape[-1] % Block_K }" + + x_ = x + # (0, 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11) + x_ = x_.view(-1, x.shape[-2] // Block_N, NWaves, NRepeat, NLane, NInterleave, NVec, x.shape[-1] // Block_K, KWaves, KRepeat, KLane, KVec) + # x_ = x_.permute(0, 1, 7, 3, 5, 9, 8, 2, 10, 4, 6, 11) + x_ = x_.permute(0, 1, 7, 2, 8, 3, 5, 9, 10, 4, 6, 11) + x_ = x_.contiguous() + x_ = x_.view(-1, x.shape[-2] // Block_N, x.shape[-1] // Block_K, Block_N * Block_K) + return x_ + +# layout(NWaves, NRepeat, NLane, NInterleave, NVec, KWaves, KRepeat, KLane, KVec) +def ck_shuffle_weight_down(x:torch.Tensor, layout=(4, 2, 16, 1, 1, 1, 4, 4, 8)) -> torch.Tensor: + NWaves, NRepeat, NLane, NInterleave, NVec, KWaves, KRepeat, KLane, KVec = layout + Block_N = NWaves * NRepeat * NLane * NInterleave 
* NVec + Block_K = KWaves * KRepeat * KLane * KVec + assert x.shape[-2] % Block_N == 0, f"{x.shape[-2]} % {Block_N} == {x.shape[-2] % Block_N }" + assert x.shape[-1] % Block_K == 0, f"{x.shape[-1]} % {Block_K} == {x.shape[-1] % Block_K }" + + x_ = x + # (0, 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11) + x_ = x_.view(-1, x.shape[-2] // Block_N, NWaves, NRepeat, NLane, NInterleave, NVec, x.shape[-1] // Block_K, KWaves, KRepeat, KLane, KVec) + x_ = x_.permute(0, 7, 1, 2, 8, 3, 5, 9, 10, 4, 6, 11) #down weight loop in N dim + x_ = x_.contiguous() + x_ = x_.view(-1, x.shape[-1] // Block_K, x.shape[-2] // Block_N, Block_N * Block_K) + return x_ + +def reverse_awq_order(tensor: torch.Tensor) -> torch.Tensor: + """Reverse the AWQ order of the given tensor. + + Args: + tensor: Input tensor to reorder + + Returns: + Reordered tensor with bits masked to 4 bits + """ + bits = 4 + AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] + reverse_order_tensor = torch.arange( + tensor.shape[-1], + dtype=torch.int32, + device=tensor.device, + ) + reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) + reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] + reverse_order_tensor = reverse_order_tensor.view(-1) + + tensor = tensor[:, reverse_order_tensor] & 0xF + return tensor + +def awq_reorder_and_repack( + qweight: torch.Tensor, + qzeros: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """Reorder and pack weights and zeros using AWQ order. + + This function unpacks the 4-bit quantized weights and zeros from int32, + applies reverse_awq_order to reorder them, and then packs them. 
+ For weight, repack to [N, K//2] + For zeros, repack to [K//G, N//2] + Args: + qweight: Quantized weight tensor of shape [K, N // 8] with dtype int32 + qzeros: Quantized zero points tensor of shape [K // G, N // 8] with dtype int32 + + Returns: + Tuple of (reordered_qweight, reordered_qzeros) both with dtype int8 + """ + bits = 4 + shifts = torch.arange(0, 32, bits, device=qweight.device) + K = qweight.shape[0] + N = qweight.shape[1] * 8 + G = K // qzeros.shape[0] + + # Unpack weights: [K, N//8] -> [K, N//8, 8] -> [K, N] + iweights = torch.bitwise_right_shift( + qweight[:, :, None], + shifts[None, None, :], + ).to(torch.int8) + iweights = iweights.view(K, -1) + + # Unpack zeros: [K//G, N//8] -> [K//G, N//8, 8] -> [K//G, N] + zeros = torch.bitwise_right_shift( + qzeros[:, :, None], + shifts[None, None, :], + ).to(torch.int8) + zeros = zeros.view(K//G, -1) + + # Apply reverse AWQ order to both tensors + iweights = reverse_awq_order(iweights) + zeros = reverse_awq_order(zeros) + + # Mask to 4 bits + iweights = torch.bitwise_and(iweights, (2**bits) - 1) + zeros = torch.bitwise_and(zeros, (2**bits) - 1) + + # Repack weight to int32 and pack along the K direction + # [K, N] -> [N, K] + iweights = iweights.transpose(1, 0).contiguous() + # Reshape to [N, K//2, 2] for weights + iweights_packed = iweights.view(N, -1, 2) + + # Repack zeros to int8 and pack along the N direction + # Reshape to [K//G, N//2, 2] for zeros + zeros_packed = zeros.view(K//G, -1, 2) + + # Pack 2 int4 values into int8 using bit shifts + # Direct packing: pack in the order they appear after reordering + packed_weights = torch.zeros([N, K//2], dtype=torch.int8, device=qweight.device) + packed_zeros = torch.zeros([K//G, N//2], dtype=torch.int8, device=zeros.device) + + for i in range(2): + packed_weights |= (iweights_packed[:, :, i].to(torch.int8) << (i * bits)) + packed_zeros |= (zeros_packed[:, :, i].to(torch.int8) << (i * bits)) + + return packed_weights, packed_zeros diff --git 
a/aiter/ops/tilelang/__init__.py b/aiter/ops/tilelang/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ccdf5c7c5b14d375c788cc32a0db10ca889ce02 --- /dev/null +++ b/aiter/ops/tilelang/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: MIT + +from .sparse_mla_fwd import tilelang_sparse_fwd, ref_sparse_mla_fwd_interface + +__all__ = ["tilelang_sparse_fwd", "ref_sparse_mla_fwd_interface"] + diff --git a/aiter/ops/tilelang/configs/fp8_index/fp8_index_tuned_config_h32_d128_cu72.py b/aiter/ops/tilelang/configs/fp8_index/fp8_index_tuned_config_h32_d128_cu72.py new file mode 100644 index 0000000000000000000000000000000000000000..2026712bdbdea27431443219f9dd8e62e986310f --- /dev/null +++ b/aiter/ops/tilelang/configs/fp8_index/fp8_index_tuned_config_h32_d128_cu72.py @@ -0,0 +1,673 @@ +# Auto-generated by tune_fp8_index.py. Do not edit manually. + +from typing import Tuple, Dict +import bisect + +M_REPR_TABLE = [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 2560, 3072, 3584, 4096] +N_REPR_TABLE = [64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, 40960, 49152, 57344, 65536, 73728, 81920, 90112, 98304, 106496, 114688, 122880, 131072] + +CONFIG_MAP: Dict[Tuple[int, int], Tuple[int, int, int]] = { + (1, 64): (1, 128, 64), + (1, 128): (1, 128, 64), + (1, 256): (1, 128, 64), + (1, 512): (1, 128, 64), + (1, 768): (1, 128, 64), + (1, 1024): (1, 128, 64), + (1, 1536): (1, 128, 64), + (1, 2048): (1, 128, 64), + (1, 3072): (1, 128, 64), + (1, 4096): (1, 128, 64), + (1, 6144): (1, 128, 64), + (1, 8192): (1, 128, 64), + (1, 12288): (1, 128, 64), + (1, 16384): (1, 128, 64), + (1, 24576): (1, 128, 64), + (1, 32768): (1, 128, 64), + (1, 40960): (1, 128, 64), + (1, 49152): (1, 128, 64), + (1, 57344): (1, 128, 64), + (1, 65536): (1, 128, 64), + (1, 73728): (1, 128, 64), + (1, 81920): (1, 128, 64), + (1, 90112): (1, 128, 64), + (1, 98304): (1, 128, 64), + (1, 106496): (1, 
128, 64), + (1, 114688): (1, 128, 64), + (1, 122880): (1, 128, 64), + (1, 131072): (1, 128, 64), + (2, 64): (2, 64, 64), + (2, 128): (2, 64, 64), + (2, 256): (2, 64, 64), + (2, 512): (2, 64, 64), + (2, 768): (2, 64, 64), + (2, 1024): (2, 64, 64), + (2, 1536): (2, 64, 64), + (2, 2048): (2, 64, 64), + (2, 3072): (2, 64, 64), + (2, 4096): (2, 64, 64), + (2, 6144): (2, 128, 64), + (2, 8192): (2, 128, 64), + (2, 12288): (2, 256, 64), + (2, 16384): (2, 256, 64), + (2, 24576): (2, 256, 64), + (2, 32768): (2, 256, 64), + (2, 40960): (2, 256, 64), + (2, 49152): (2, 256, 64), + (2, 57344): (2, 256, 64), + (2, 65536): (2, 256, 64), + (2, 73728): (2, 256, 64), + (2, 81920): (2, 512, 128), + (2, 90112): (2, 512, 128), + (2, 98304): (2, 512, 128), + (2, 106496): (2, 512, 128), + (2, 114688): (2, 512, 128), + (2, 122880): (2, 512, 128), + (2, 131072): (2, 512, 128), + (4, 64): (2, 64, 64), + (4, 128): (2, 64, 64), + (4, 256): (2, 64, 64), + (4, 512): (2, 64, 64), + (4, 768): (2, 64, 64), + (4, 1024): (2, 64, 64), + (4, 1536): (2, 64, 64), + (4, 2048): (2, 64, 64), + (4, 3072): (2, 64, 64), + (4, 4096): (2, 64, 64), + (4, 6144): (4, 128, 64), + (4, 8192): (4, 128, 64), + (4, 12288): (2, 256, 64), + (4, 16384): (4, 128, 64), + (4, 24576): (2, 256, 64), + (4, 32768): (2, 256, 64), + (4, 40960): (4, 256, 64), + (4, 49152): (4, 256, 64), + (4, 57344): (4, 256, 64), + (4, 65536): (4, 256, 64), + (4, 73728): (4, 256, 64), + (4, 81920): (4, 512, 128), + (4, 90112): (4, 512, 128), + (4, 98304): (4, 512, 128), + (4, 106496): (4, 512, 128), + (4, 114688): (4, 512, 128), + (4, 122880): (4, 512, 128), + (4, 131072): (4, 512, 128), + (8, 64): (4, 64, 64), + (8, 128): (2, 64, 64), + (8, 256): (2, 64, 64), + (8, 512): (4, 64, 64), + (8, 768): (2, 64, 64), + (8, 1024): (2, 64, 64), + (8, 1536): (2, 64, 64), + (8, 2048): (2, 64, 64), + (8, 3072): (4, 128, 64), + (8, 4096): (4, 128, 64), + (8, 6144): (4, 128, 64), + (8, 8192): (4, 128, 64), + (8, 12288): (2, 256, 64), + (8, 16384): (2, 256, 64), + 
(8, 24576): (4, 128, 64), + (8, 32768): (4, 256, 64), + (8, 40960): (4, 256, 64), + (8, 49152): (4, 512, 128), + (8, 57344): (4, 256, 64), + (8, 65536): (4, 512, 128), + (8, 73728): (4, 512, 128), + (8, 81920): (4, 512, 128), + (8, 90112): (4, 512, 128), + (8, 98304): (8, 512, 64), + (8, 106496): (8, 512, 64), + (8, 114688): (4, 1024, 128), + (8, 122880): (4, 512, 128), + (8, 131072): (8, 512, 64), + (16, 64): (2, 64, 64), + (16, 128): (2, 64, 64), + (16, 256): (2, 64, 64), + (16, 512): (2, 64, 64), + (16, 768): (2, 64, 64), + (16, 1024): (2, 64, 64), + (16, 1536): (4, 128, 64), + (16, 2048): (4, 128, 64), + (16, 3072): (2, 256, 64), + (16, 4096): (4, 128, 64), + (16, 6144): (2, 256, 64), + (16, 8192): (2, 256, 64), + (16, 12288): (4, 128, 64), + (16, 16384): (4, 256, 64), + (16, 24576): (4, 128, 64), + (16, 32768): (4, 512, 128), + (16, 40960): (4, 256, 64), + (16, 49152): (4, 256, 64), + (16, 57344): (4, 512, 128), + (16, 65536): (4, 1024, 128), + (16, 73728): (4, 1024, 128), + (16, 81920): (4, 512, 128), + (16, 90112): (4, 512, 128), + (16, 98304): (4, 512, 128), + (16, 106496): (4, 512, 128), + (16, 114688): (8, 512, 64), + (16, 122880): (8, 512, 64), + (16, 131072): (4, 2048, 128), + (32, 64): (2, 64, 64), + (32, 128): (2, 64, 64), + (32, 256): (2, 64, 64), + (32, 512): (2, 64, 64), + (32, 768): (4, 128, 64), + (32, 1024): (2, 128, 64), + (32, 1536): (4, 128, 64), + (32, 2048): (2, 256, 64), + (32, 3072): (2, 256, 64), + (32, 4096): (2, 256, 64), + (32, 6144): (4, 256, 64), + (32, 8192): (4, 128, 64), + (32, 12288): (4, 512, 128), + (32, 16384): (4, 512, 128), + (32, 24576): (4, 256, 64), + (32, 32768): (4, 512, 128), + (32, 40960): (4, 512, 128), + (32, 49152): (4, 512, 128), + (32, 57344): (4, 1024, 128), + (32, 65536): (4, 512, 128), + (32, 73728): (8, 512, 64), + (32, 81920): (8, 512, 64), + (32, 90112): (8, 512, 64), + (32, 98304): (8, 512, 64), + (32, 106496): (8, 512, 64), + (32, 114688): (8, 512, 64), + (32, 122880): (8, 1024, 64), + (32, 131072): (8, 
512, 64), + (64, 64): (2, 64, 64), + (64, 128): (2, 64, 64), + (64, 256): (2, 64, 64), + (64, 512): (4, 128, 64), + (64, 768): (4, 128, 64), + (64, 1024): (2, 256, 64), + (64, 1536): (2, 256, 64), + (64, 2048): (4, 128, 64), + (64, 3072): (4, 256, 64), + (64, 4096): (4, 256, 64), + (64, 6144): (4, 128, 64), + (64, 8192): (4, 512, 128), + (64, 12288): (4, 256, 64), + (64, 16384): (4, 1024, 128), + (64, 24576): (4, 512, 128), + (64, 32768): (4, 512, 128), + (64, 40960): (4, 512, 128), + (64, 49152): (4, 1024, 128), + (64, 57344): (4, 512, 128), + (64, 65536): (4, 1024, 128), + (64, 73728): (8, 1024, 64), + (64, 81920): (8, 512, 64), + (64, 90112): (8, 1024, 64), + (64, 98304): (8, 1024, 64), + (64, 106496): (8, 1024, 64), + (64, 114688): (8, 512, 64), + (64, 122880): (8, 512, 64), + (64, 131072): (8, 512, 64), + (128, 64): (2, 64, 64), + (128, 128): (2, 64, 64), + (128, 256): (2, 128, 64), + (128, 512): (4, 128, 64), + (128, 768): (2, 256, 64), + (128, 1024): (2, 256, 64), + (128, 1536): (4, 256, 64), + (128, 2048): (4, 256, 64), + (128, 3072): (4, 128, 64), + (128, 4096): (4, 512, 128), + (128, 6144): (4, 256, 64), + (128, 8192): (4, 1024, 128), + (128, 12288): (4, 512, 128), + (128, 16384): (4, 512, 128), + (128, 24576): (4, 1024, 128), + (128, 32768): (4, 512, 128), + (128, 40960): (4, 1024, 128), + (128, 49152): (4, 2048, 128), + (128, 57344): (8, 512, 64), + (128, 65536): (8, 512, 64), + (128, 73728): (8, 1024, 64), + (128, 81920): (8, 1024, 64), + (128, 90112): (4, 4096, 128), + (128, 98304): (8, 1024, 64), + (128, 106496): (8, 1024, 64), + (128, 114688): (8, 1024, 64), + (128, 122880): (8, 1024, 64), + (128, 131072): (8, 1024, 64), + (256, 64): (2, 64, 64), + (256, 128): (4, 128, 64), + (256, 256): (4, 128, 64), + (256, 512): (2, 256, 64), + (256, 768): (4, 256, 64), + (256, 1024): (4, 256, 64), + (256, 1536): (4, 128, 64), + (256, 2048): (4, 512, 128), + (256, 3072): (4, 256, 64), + (256, 4096): (4, 128, 64), + (256, 6144): (4, 512, 128), + (256, 8192): (4, 
512, 128), + (256, 12288): (4, 1024, 128), + (256, 16384): (4, 1024, 128), + (256, 24576): (4, 1024, 128), + (256, 32768): (4, 1024, 128), + (256, 40960): (4, 2048, 128), + (256, 49152): (4, 2048, 128), + (256, 57344): (8, 1024, 64), + (256, 65536): (8, 1024, 64), + (256, 73728): (8, 2048, 64), + (256, 81920): (8, 2048, 64), + (256, 90112): (8, 2048, 64), + (256, 98304): (8, 1024, 64), + (256, 106496): (8, 1024, 64), + (256, 114688): (8, 2048, 64), + (256, 122880): (8, 2048, 64), + (256, 131072): (8, 1024, 64), + (384, 64): (2, 64, 64), + (384, 128): (2, 128, 64), + (384, 256): (4, 128, 64), + (384, 512): (4, 256, 64), + (384, 768): (4, 256, 64), + (384, 1024): (4, 128, 64), + (384, 1536): (4, 512, 128), + (384, 2048): (4, 256, 64), + (384, 3072): (4, 1024, 128), + (384, 4096): (4, 512, 128), + (384, 6144): (4, 1024, 128), + (384, 8192): (4, 1024, 128), + (384, 12288): (4, 1024, 128), + (384, 16384): (4, 1024, 128), + (384, 24576): (4, 1024, 128), + (384, 32768): (4, 1024, 128), + (384, 40960): (4, 1024, 128), + (384, 49152): (8, 1024, 64), + (384, 57344): (8, 2048, 64), + (384, 65536): (8, 1024, 64), + (384, 73728): (8, 2048, 64), + (384, 81920): (8, 2048, 64), + (384, 90112): (8, 1024, 64), + (384, 98304): (8, 2048, 64), + (384, 106496): (8, 2048, 64), + (384, 114688): (8, 2048, 64), + (384, 122880): (8, 4096, 64), + (384, 131072): (8, 2048, 64), + (512, 64): (2, 64, 64), + (512, 128): (4, 128, 64), + (512, 256): (2, 256, 64), + (512, 512): (4, 256, 64), + (512, 768): (4, 128, 64), + (512, 1024): (4, 512, 128), + (512, 1536): (4, 256, 64), + (512, 2048): (4, 1024, 128), + (512, 3072): (4, 512, 128), + (512, 4096): (4, 512, 128), + (512, 6144): (4, 512, 128), + (512, 8192): (4, 1024, 128), + (512, 12288): (4, 1024, 128), + (512, 16384): (4, 1024, 128), + (512, 24576): (4, 1024, 128), + (512, 32768): (4, 1024, 128), + (512, 40960): (4, 2048, 128), + (512, 49152): (8, 1024, 64), + (512, 57344): (8, 2048, 64), + (512, 65536): (8, 1024, 64), + (512, 73728): (8, 2048, 
64), + (512, 81920): (8, 2048, 64), + (512, 90112): (8, 2048, 64), + (512, 98304): (8, 2048, 64), + (512, 106496): (8, 2048, 64), + (512, 114688): (8, 4096, 64), + (512, 122880): (8, 4096, 64), + (512, 131072): (8, 2048, 64), + (640, 64): (2, 64, 64), + (640, 128): (4, 128, 64), + (640, 256): (4, 128, 64), + (640, 512): (4, 128, 64), + (640, 768): (4, 256, 64), + (640, 1024): (4, 256, 64), + (640, 1536): (4, 512, 128), + (640, 2048): (4, 512, 128), + (640, 3072): (4, 1024, 128), + (640, 4096): (4, 1024, 128), + (640, 6144): (4, 1024, 128), + (640, 8192): (4, 1024, 128), + (640, 12288): (4, 1024, 128), + (640, 16384): (4, 1024, 128), + (640, 24576): (4, 1024, 128), + (640, 32768): (4, 1024, 128), + (640, 40960): (4, 2048, 128), + (640, 49152): (8, 2048, 64), + (640, 57344): (8, 1024, 64), + (640, 65536): (8, 2048, 64), + (640, 73728): (8, 2048, 64), + (640, 81920): (8, 2048, 64), + (640, 90112): (8, 2048, 64), + (640, 98304): (8, 4096, 64), + (640, 106496): (8, 4096, 64), + (640, 114688): (8, 2048, 64), + (640, 122880): (8, 2048, 64), + (640, 131072): (8, 4096, 64), + (768, 64): (2, 64, 64), + (768, 128): (4, 128, 64), + (768, 256): (4, 256, 64), + (768, 512): (4, 128, 64), + (768, 768): (4, 256, 64), + (768, 1024): (4, 256, 64), + (768, 1536): (4, 512, 128), + (768, 2048): (4, 512, 128), + (768, 3072): (4, 1024, 128), + (768, 4096): (4, 1024, 128), + (768, 6144): (4, 1024, 128), + (768, 8192): (4, 1024, 128), + (768, 12288): (4, 1024, 128), + (768, 16384): (4, 1024, 128), + (768, 24576): (4, 1024, 128), + (768, 32768): (4, 1024, 128), + (768, 40960): (4, 2048, 128), + (768, 49152): (8, 2048, 64), + (768, 57344): (8, 2048, 64), + (768, 65536): (8, 2048, 64), + (768, 73728): (8, 2048, 64), + (768, 81920): (8, 4096, 64), + (768, 90112): (8, 2048, 64), + (768, 98304): (8, 4096, 64), + (768, 106496): (8, 4096, 64), + (768, 114688): (8, 2048, 64), + (768, 122880): (8, 4096, 64), + (768, 131072): (8, 4096, 64), + (896, 64): (8, 64, 64), + (896, 128): (4, 128, 64), + (896, 
256): (4, 128, 64), + (896, 512): (4, 256, 64), + (896, 768): (4, 128, 64), + (896, 1024): (4, 256, 64), + (896, 1536): (4, 512, 128), + (896, 2048): (4, 512, 128), + (896, 3072): (4, 512, 128), + (896, 4096): (4, 1024, 128), + (896, 6144): (4, 1024, 128), + (896, 8192): (4, 1024, 128), + (896, 12288): (4, 1024, 128), + (896, 16384): (4, 2048, 128), + (896, 24576): (4, 1024, 128), + (896, 32768): (4, 1024, 128), + (896, 40960): (4, 2048, 128), + (896, 49152): (8, 1024, 64), + (896, 57344): (8, 2048, 64), + (896, 65536): (8, 2048, 64), + (896, 73728): (8, 4096, 64), + (896, 81920): (8, 2048, 64), + (896, 90112): (8, 2048, 64), + (896, 98304): (8, 2048, 64), + (896, 106496): (8, 2048, 64), + (896, 114688): (8, 4096, 64), + (896, 122880): (8, 4096, 64), + (896, 131072): (8, 4096, 64), + (1024, 64): (2, 64, 64), + (1024, 128): (4, 128, 64), + (1024, 256): (4, 256, 64), + (1024, 512): (4, 512, 128), + (1024, 768): (4, 256, 64), + (1024, 1024): (4, 1024, 128), + (1024, 1536): (4, 512, 128), + (1024, 2048): (4, 512, 128), + (1024, 3072): (4, 1024, 128), + (1024, 4096): (4, 1024, 128), + (1024, 6144): (4, 1024, 128), + (1024, 8192): (4, 1024, 128), + (1024, 12288): (4, 1024, 128), + (1024, 16384): (4, 1024, 128), + (1024, 24576): (4, 2048, 128), + (1024, 32768): (4, 1024, 128), + (1024, 40960): (4, 2048, 128), + (1024, 49152): (8, 2048, 64), + (1024, 57344): (8, 2048, 64), + (1024, 65536): (8, 2048, 64), + (1024, 73728): (8, 4096, 64), + (1024, 81920): (8, 4096, 64), + (1024, 90112): (8, 2048, 64), + (1024, 98304): (8, 4096, 64), + (1024, 106496): (8, 2048, 64), + (1024, 114688): (8, 4096, 64), + (1024, 122880): (8, 4096, 64), + (1024, 131072): (8, 4096, 64), + (1280, 64): (2, 64, 64), + (1280, 128): (4, 128, 64), + (1280, 256): (4, 128, 64), + (1280, 512): (4, 256, 64), + (1280, 768): (4, 128, 64), + (1280, 1024): (4, 512, 128), + (1280, 1536): (4, 512, 128), + (1280, 2048): (4, 512, 128), + (1280, 3072): (4, 1024, 128), + (1280, 4096): (4, 1024, 128), + (1280, 6144): (4, 
1024, 128), + (1280, 8192): (4, 1024, 128), + (1280, 12288): (4, 2048, 128), + (1280, 16384): (4, 2048, 128), + (1280, 24576): (4, 1024, 128), + (1280, 32768): (4, 1024, 128), + (1280, 40960): (4, 2048, 128), + (1280, 49152): (8, 4096, 64), + (1280, 57344): (8, 2048, 64), + (1280, 65536): (8, 4096, 64), + (1280, 73728): (8, 4096, 64), + (1280, 81920): (8, 2048, 64), + (1280, 90112): (8, 4096, 64), + (1280, 98304): (8, 4096, 64), + (1280, 106496): (8, 4096, 64), + (1280, 114688): (8, 4096, 64), + (1280, 122880): (8, 4096, 64), + (1280, 131072): (8, 4096, 64), + (1536, 64): (2, 64, 64), + (1536, 128): (4, 128, 64), + (1536, 256): (4, 128, 64), + (1536, 512): (4, 256, 64), + (1536, 768): (4, 256, 64), + (1536, 1024): (4, 512, 128), + (1536, 1536): (4, 512, 128), + (1536, 2048): (4, 1024, 128), + (1536, 3072): (4, 1024, 128), + (1536, 4096): (4, 1024, 128), + (1536, 6144): (4, 1024, 128), + (1536, 8192): (4, 1024, 128), + (1536, 12288): (4, 2048, 128), + (1536, 16384): (4, 2048, 128), + (1536, 24576): (4, 2048, 128), + (1536, 32768): (4, 1024, 128), + (1536, 40960): (4, 2048, 128), + (1536, 49152): (8, 4096, 64), + (1536, 57344): (8, 2048, 64), + (1536, 65536): (8, 4096, 64), + (1536, 73728): (8, 4096, 64), + (1536, 81920): (8, 4096, 64), + (1536, 90112): (8, 4096, 64), + (1536, 98304): (8, 4096, 64), + (1536, 106496): (8, 4096, 64), + (1536, 114688): (8, 4096, 64), + (1536, 122880): (8, 8192, 64), + (1536, 131072): (8, 4096, 64), + (1792, 64): (2, 64, 64), + (1792, 128): (4, 128, 64), + (1792, 256): (4, 128, 64), + (1792, 512): (4, 256, 64), + (1792, 768): (4, 256, 64), + (1792, 1024): (4, 512, 128), + (1792, 1536): (4, 512, 128), + (1792, 2048): (4, 1024, 128), + (1792, 3072): (4, 1024, 128), + (1792, 4096): (4, 1024, 128), + (1792, 6144): (4, 1024, 128), + (1792, 8192): (4, 2048, 128), + (1792, 12288): (4, 2048, 128), + (1792, 16384): (4, 2048, 128), + (1792, 24576): (4, 2048, 128), + (1792, 32768): (4, 1024, 128), + (1792, 40960): (4, 2048, 128), + (1792, 49152): 
(8, 2048, 64), + (1792, 57344): (8, 4096, 64), + (1792, 65536): (8, 4096, 64), + (1792, 73728): (8, 4096, 64), + (1792, 81920): (8, 4096, 64), + (1792, 90112): (8, 4096, 64), + (1792, 98304): (8, 4096, 64), + (1792, 106496): (8, 4096, 64), + (1792, 114688): (8, 8192, 64), + (1792, 122880): (8, 8192, 64), + (1792, 131072): (8, 8192, 64), + (2048, 64): (2, 64, 64), + (2048, 128): (4, 128, 64), + (2048, 256): (4, 128, 64), + (2048, 512): (4, 512, 128), + (2048, 768): (4, 256, 64), + (2048, 1024): (4, 512, 128), + (2048, 1536): (4, 512, 128), + (2048, 2048): (4, 1024, 128), + (2048, 3072): (4, 1024, 128), + (2048, 4096): (4, 1024, 128), + (2048, 6144): (4, 1024, 128), + (2048, 8192): (4, 1024, 128), + (2048, 12288): (4, 2048, 128), + (2048, 16384): (4, 2048, 128), + (2048, 24576): (4, 2048, 128), + (2048, 32768): (4, 2048, 128), + (2048, 40960): (4, 2048, 128), + (2048, 49152): (8, 4096, 64), + (2048, 57344): (8, 4096, 64), + (2048, 65536): (8, 4096, 64), + (2048, 73728): (8, 4096, 64), + (2048, 81920): (8, 4096, 64), + (2048, 90112): (8, 4096, 64), + (2048, 98304): (8, 8192, 64), + (2048, 106496): (8, 4096, 64), + (2048, 114688): (8, 8192, 64), + (2048, 122880): (8, 4096, 64), + (2048, 131072): (8, 8192, 64), + (2560, 64): (4, 64, 64), + (2560, 128): (4, 128, 64), + (2560, 256): (4, 256, 64), + (2560, 512): (4, 512, 128), + (2560, 768): (4, 256, 64), + (2560, 1024): (4, 512, 128), + (2560, 1536): (4, 512, 128), + (2560, 2048): (4, 1024, 128), + (2560, 3072): (4, 1024, 128), + (2560, 4096): (4, 1024, 128), + (2560, 6144): (4, 2048, 128), + (2560, 8192): (4, 2048, 128), + (2560, 12288): (4, 2048, 128), + (2560, 16384): (4, 2048, 128), + (2560, 24576): (4, 2048, 128), + (2560, 32768): (4, 1024, 128), + (2560, 40960): (4, 2048, 128), + (2560, 49152): (8, 4096, 64), + (2560, 57344): (8, 4096, 64), + (2560, 65536): (8, 4096, 64), + (2560, 73728): (8, 4096, 64), + (2560, 81920): (8, 4096, 64), + (2560, 90112): (8, 8192, 64), + (2560, 98304): (8, 4096, 64), + (2560, 106496): 
(8, 8192, 64), + (2560, 114688): (8, 4096, 64), + (2560, 122880): (8, 8192, 64), + (2560, 131072): (8, 4096, 64), + (3072, 64): (4, 64, 64), + (3072, 128): (4, 128, 64), + (3072, 256): (4, 256, 64), + (3072, 512): (4, 512, 128), + (3072, 768): (8, 256, 64), + (3072, 1024): (4, 1024, 128), + (3072, 1536): (4, 512, 128), + (3072, 2048): (4, 1024, 128), + (3072, 3072): (4, 1024, 128), + (3072, 4096): (4, 1024, 128), + (3072, 6144): (4, 2048, 128), + (3072, 8192): (4, 2048, 128), + (3072, 12288): (4, 2048, 128), + (3072, 16384): (4, 2048, 128), + (3072, 24576): (4, 2048, 128), + (3072, 32768): (4, 2048, 128), + (3072, 40960): (8, 2048, 64), + (3072, 49152): (8, 4096, 64), + (3072, 57344): (8, 4096, 64), + (3072, 65536): (8, 4096, 64), + (3072, 73728): (8, 8192, 64), + (3072, 81920): (8, 4096, 64), + (3072, 90112): (8, 8192, 64), + (3072, 98304): (8, 8192, 64), + (3072, 106496): (8, 4096, 64), + (3072, 114688): (8, 8192, 64), + (3072, 122880): (8, 8192, 64), + (3072, 131072): (8, 8192, 64), + (3584, 64): (4, 64, 64), + (3584, 128): (4, 128, 64), + (3584, 256): (4, 128, 64), + (3584, 512): (4, 512, 128), + (3584, 768): (4, 256, 64), + (3584, 1024): (4, 512, 128), + (3584, 1536): (4, 512, 128), + (3584, 2048): (4, 1024, 128), + (3584, 3072): (4, 1024, 128), + (3584, 4096): (4, 2048, 128), + (3584, 6144): (4, 2048, 128), + (3584, 8192): (4, 2048, 128), + (3584, 12288): (4, 2048, 128), + (3584, 16384): (4, 2048, 128), + (3584, 24576): (4, 2048, 128), + (3584, 32768): (4, 2048, 128), + (3584, 40960): (8, 4096, 64), + (3584, 49152): (8, 4096, 64), + (3584, 57344): (8, 4096, 64), + (3584, 65536): (8, 4096, 64), + (3584, 73728): (8, 8192, 64), + (3584, 81920): (8, 4096, 64), + (3584, 90112): (8, 4096, 64), + (3584, 98304): (8, 8192, 64), + (3584, 106496): (8, 8192, 64), + (3584, 114688): (8, 8192, 64), + (3584, 122880): (8, 8192, 64), + (3584, 131072): (8, 8192, 64), + (4096, 64): (4, 64, 64), + (4096, 128): (4, 128, 64), + (4096, 256): (4, 128, 64), + (4096, 512): (4, 512, 
def get_tuned_config(m: int, n: int) -> Tuple[int, int, int]:
    """Return the tuned ``(m_split, blk_n1, blk_n2)`` for a problem size.

    Each of ``m`` and ``n`` is mapped to the largest entry of its
    representative table that does not exceed it, clamped to the table's
    first and last entries, and the pair is looked up in ``CONFIG_MAP``.
    """

    def _floor_entry(table, value):
        # Clamp below the table explicitly: bisect would otherwise yield
        # index -1 and silently pick the last entry.
        if value <= table[0]:
            return table[0]
        if value >= table[-1]:
            return table[-1]
        return table[bisect.bisect_right(table, value) - 1]

    key = (_floor_entry(M_REPR_TABLE, m), _floor_entry(N_REPR_TABLE, n))
    return CONFIG_MAP[key]
def fast_log2_ceil(x):
    """Build an int32 expression for ceil(log2(x)) from x's float32 bit pattern.

    The biased exponent field gives floor(log2(x)); one is added whenever any
    mantissa bit is set, i.e. whenever x is not an exact power of two.
    NOTE(review): assumes x is a positive, normal float32 value - confirm.
    """
    mantissa_width = 23
    exponent_bias = 127
    raw_bits = T.reinterpret("uint32", x)
    biased_exponent = (raw_bits >> mantissa_width) & 0xFF
    mantissa = raw_bits & ((1 << mantissa_width) - 1)
    round_up = T.if_then_else(mantissa != 0, 1, 0)
    return T.Cast("int32", biased_exponent - exponent_bias + round_up)


def fast_pow2(x):
    """Build the float32 value 2**x by writing x + 127 into the exponent field.

    NOTE(review): assumes x is an integer expression whose biased value fits
    the 8-bit exponent field - confirm.
    """
    exponent_field = (x + 127) << 23
    return T.reinterpret("float32", exponent_field)


def fast_round_scale(amax, fp8_max_inv):
    """Return ``amax * fp8_max_inv`` rounded up to the next power of two."""
    raw_scale = amax * fp8_max_inv
    return fast_pow2(fast_log2_ceil(raw_scale))
def act_quant(
    x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Quantizes the input tensor `x` to FP8 using block-wise quantization.

    Args:
        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the quantization blocks along the last dimension. Only 128 is supported, because `act_quant_kernel` hard-codes its group size to 128. Default is 128.
        scale_fmt (Optional[str], optional): The format of the scale. Any non-None value makes the kernel round each scale up to a power of two (`round_scale=True`); None keeps the plain `amax / 448` scale. Default is None.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized tensor with dtype `torch.float8_e4m3fn` and the same shape as `x`.
            - A tensor of scaling factors with dtype `torch.float32` and shape `(*x.shape[:-1], x.size(-1) // block_size)`.
    Raises:
        AssertionError: if `x` is not contiguous, `block_size` is not 128, or the last dimension is not divisible by `block_size`.
    """
    # Must match `group_size` inside act_quant_kernel: the kernel writes
    # ceildiv(N, 128) scales per row regardless of `block_size`, so any other
    # value would hand it a scale tensor of the wrong shape.
    kernel_group_size = 128
    assert x.is_contiguous(), "Input tensor must be contiguous"
    assert (
        block_size == kernel_group_size
    ), f"act_quant_kernel only supports block_size={kernel_group_size} (got block_size={block_size})"
    assert (
        x.size(-1) % block_size == 0
    ), f"Last dimension size must be divisible by block_size (block_size={block_size})"
    N = x.size(-1)
    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    s = x.new_empty(*x.size()[:-1], N // block_size, dtype=torch.float32)
    kernel = act_quant_kernel(N, round_scale=scale_fmt is not None)
    # The kernel works on 2-D views; y and s are filled in place through them.
    kernel(x.view(-1, N), y.view(-1, N), s.view(-1, N // block_size))
    return y, s
+ q_s: T.Tensor[(b, m, h), FP32], + k: T.Tensor[(b, n, d), FP8], + k_s: T.Tensor[(b, n), FP32], + o: T.Tensor[(b, m, n), FP32], + ) -> None: + with T.Kernel(b, T.ceildiv(n, blk_n1), m, threads=threads) as ( + i_b, i1_n, i_m_block + ): + if disable_buffer_ops: + T.disable_buffer_ops(o) + m_start = i_m_block + q_smem = T.alloc_shared((h, d), FP8) + k_smem = T.alloc_shared((blk_n2, d), FP8) + T.annotate_layout({ + q_smem: tilelang.layout.make_hcu_swizzled_layout(q_smem, major_pack=1), + k_smem: tilelang.layout.make_hcu_swizzled_layout(k_smem, major_pack=1), + }) + q_frag = T.alloc_fragment((h, d), FP8) + q_s_frag = T.alloc_fragment(h, FP32) + k_frag = T.alloc_fragment((blk_n2, d), FP8) + k_s_frag = T.alloc_fragment(blk_n2, FP32) + logits = T.alloc_fragment((blk_n2, h), FP32) + logits_sum = T.alloc_fragment(blk_n2, FP32) + T.copy(q[i_b, m_start, 0, 0], q_smem) + T.copy(q_smem, q_frag) + T.copy(q_s[i_b, m_start, 0], q_s_frag) + for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=0): + T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem) + T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag) + T.clear(logits) + T.copy(k_smem, k_frag) + T.gemm(k_frag, q_frag, logits, transpose_A=False, transpose_B=True, policy=gemm_policy) + + for i_h, i3_n in T.Parallel(h, blk_n2): + logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h] + T.reduce_sum(logits, logits_sum, dim=1) + for i3_n in T.Parallel(blk_n2): + logits_sum[i3_n] *= k_s_frag[i3_n] + T.copy(logits_sum, o[i_b, m_start, i1_n * blk_n1 + i2_n * blk_n2]) + + @T.prim_func + def fp8_index_kernel_1( + q: T.Tensor[(b, m, h, d), FP8], + q_s: T.Tensor[(b, m, h), FP32], + k: T.Tensor[(b, n, d), FP8], + k_s: T.Tensor[(b, n), FP32], + o: T.Tensor[(b, m, n), FP32], + ) -> None: + with T.Kernel(b, T.ceildiv(n, blk_n1), T.ceildiv(m, 2), threads=threads) as ( + i_b, i1_n, i_m_block + ): + if disable_buffer_ops: + T.disable_buffer_ops(o) + m_start = i_m_block * 2 + q_smem0 = T.alloc_shared((h, d), FP8) + q_smem1 = 
T.alloc_shared((h, d), FP8) + k_smem = T.alloc_shared((blk_n2, d), FP8) + T.annotate_layout({ + q_smem0: tilelang.layout.make_hcu_swizzled_layout(q_smem0, major_pack=1), + q_smem1: tilelang.layout.make_hcu_swizzled_layout(q_smem1, major_pack=1), + k_smem: tilelang.layout.make_hcu_swizzled_layout(k_smem, major_pack=1), + }) + q_frag0 = T.alloc_fragment((h, d), FP8) + q_frag1 = T.alloc_fragment((h, d), FP8) + q_s_frag0 = T.alloc_fragment(h, FP32) + q_s_frag1 = T.alloc_fragment(h, FP32) + k_frag = T.alloc_fragment((blk_n2, d), FP8) + k_s_frag = T.alloc_fragment(blk_n2, FP32) + logits0 = T.alloc_fragment((blk_n2, h), FP32) + logits1 = T.alloc_fragment((blk_n2, h), FP32) + logits_sum0 = T.alloc_fragment(blk_n2, FP32) + logits_sum1 = T.alloc_fragment(blk_n2, FP32) + T.copy(q[i_b, m_start, 0, 0], q_smem0) + T.copy(q[i_b, m_start + 1, 0, 0], q_smem1) + T.copy(q_smem0, q_frag0) + T.copy(q_smem1, q_frag1) + T.copy(q_s[i_b, m_start, 0], q_s_frag0) + T.copy(q_s[i_b, m_start + 1, 0], q_s_frag1) + for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=0): + T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem) + T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag) + T.clear(logits0) + T.clear(logits1) + T.copy(k_smem, k_frag) + T.gemm(k_frag, q_frag0, logits0, transpose_A=False, transpose_B=True, policy=gemm_policy) + T.gemm(k_frag, q_frag1, logits1, transpose_A=False, transpose_B=True, policy=gemm_policy) + + for i_h, i3_n in T.Parallel(h, blk_n2): + logits0[i3_n, i_h] = T.max(logits0[i3_n, i_h], 0) * q_s_frag0[i_h] + logits1[i3_n, i_h] = T.max(logits1[i3_n, i_h], 0) * q_s_frag1[i_h] + T.reduce_sum(logits0, logits_sum0, dim=1) + T.reduce_sum(logits1, logits_sum1, dim=1) + for i3_n in T.Parallel(blk_n2): + logits_sum0[i3_n] *= k_s_frag[i3_n] + logits_sum1[i3_n] *= k_s_frag[i3_n] + T.copy(logits_sum0, o[i_b, m_start, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum1, o[i_b, m_start + 1, i1_n * blk_n1 + i2_n * blk_n2]) + + @T.prim_func + def fp8_index_kernel_2( + q: 
T.Tensor[(b, m, h, d), FP8], + q_s: T.Tensor[(b, m, h), FP32], + k: T.Tensor[(b, n, d), FP8], + k_s: T.Tensor[(b, n), FP32], + o: T.Tensor[(b, m, n), FP32], + ) -> None: + with T.Kernel(b, T.ceildiv(n, blk_n1), T.ceildiv(m, 4), threads=threads) as ( + i_b, i1_n, i_m_block + ): + if disable_buffer_ops: + T.disable_buffer_ops(o) + m_start = i_m_block * 4 + q_smem0 = T.alloc_shared((h, d), FP8) + q_smem1 = T.alloc_shared((h, d), FP8) + q_smem2 = T.alloc_shared((h, d), FP8) + q_smem3 = T.alloc_shared((h, d), FP8) + k_smem = T.alloc_shared((blk_n2, d), FP8) + T.annotate_layout({ + q_smem0: tilelang.layout.make_hcu_swizzled_layout(q_smem0, major_pack=1), + q_smem1: tilelang.layout.make_hcu_swizzled_layout(q_smem1, major_pack=1), + q_smem2: tilelang.layout.make_hcu_swizzled_layout(q_smem2, major_pack=1), + q_smem3: tilelang.layout.make_hcu_swizzled_layout(q_smem3, major_pack=1), + k_smem: tilelang.layout.make_hcu_swizzled_layout(k_smem, major_pack=1), + }) + q_frag0 = T.alloc_fragment((h, d), FP8) + q_frag1 = T.alloc_fragment((h, d), FP8) + q_frag2 = T.alloc_fragment((h, d), FP8) + q_frag3 = T.alloc_fragment((h, d), FP8) + q_s_frag0 = T.alloc_fragment(h, FP32) + q_s_frag1 = T.alloc_fragment(h, FP32) + q_s_frag2 = T.alloc_fragment(h, FP32) + q_s_frag3 = T.alloc_fragment(h, FP32) + T.copy(q[i_b, m_start, 0, 0], q_smem0) + T.copy(q[i_b, m_start + 1, 0, 0], q_smem1) + T.copy(q[i_b, m_start + 2, 0, 0], q_smem2) + T.copy(q[i_b, m_start + 3, 0, 0], q_smem3) + T.copy(q_smem0, q_frag0) + T.copy(q_smem1, q_frag1) + T.copy(q_smem2, q_frag2) + T.copy(q_smem3, q_frag3) + T.copy(q_s[i_b, m_start, 0], q_s_frag0) + T.copy(q_s[i_b, m_start + 1, 0], q_s_frag1) + T.copy(q_s[i_b, m_start + 2, 0], q_s_frag2) + T.copy(q_s[i_b, m_start + 3, 0], q_s_frag3) + k_frag = T.alloc_fragment((blk_n2, d), FP8) + k_s_frag = T.alloc_fragment(blk_n2, FP32) + logits0 = T.alloc_fragment((blk_n2, h), FP32) + logits1 = T.alloc_fragment((blk_n2, h), FP32) + logits2 = T.alloc_fragment((blk_n2, h), FP32) + logits3 
= T.alloc_fragment((blk_n2, h), FP32) + logits_sum0 = T.alloc_fragment(blk_n2, FP32) + logits_sum1 = T.alloc_fragment(blk_n2, FP32) + logits_sum2 = T.alloc_fragment(blk_n2, FP32) + logits_sum3 = T.alloc_fragment(blk_n2, FP32) + for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=0): + T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem) + T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag) + T.clear(logits0) + T.clear(logits1) + T.clear(logits2) + T.clear(logits3) + T.copy(k_smem, k_frag) + T.gemm(k_frag, q_frag0, logits0, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag1, logits1, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag2, logits2, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag3, logits3, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + + for i_h, i3_n in T.Parallel(h, blk_n2): + logits0[i3_n, i_h] = T.max(logits0[i3_n, i_h], 0) * q_s_frag0[i_h] + logits1[i3_n, i_h] = T.max(logits1[i3_n, i_h], 0) * q_s_frag1[i_h] + logits2[i3_n, i_h] = T.max(logits2[i3_n, i_h], 0) * q_s_frag2[i_h] + logits3[i3_n, i_h] = T.max(logits3[i3_n, i_h], 0) * q_s_frag3[i_h] + T.reduce_sum(logits0, logits_sum0, dim=1) + T.reduce_sum(logits1, logits_sum1, dim=1) + T.reduce_sum(logits2, logits_sum2, dim=1) + T.reduce_sum(logits3, logits_sum3, dim=1) + for i3_n in T.Parallel(blk_n2): + logits_sum0[i3_n] *= k_s_frag[i3_n] + logits_sum1[i3_n] *= k_s_frag[i3_n] + logits_sum2[i3_n] *= k_s_frag[i3_n] + logits_sum3[i3_n] *= k_s_frag[i3_n] + T.copy(logits_sum0, o[i_b, m_start, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum1, o[i_b, m_start + 1, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum2, o[i_b, m_start + 2, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum3, o[i_b, m_start + 3, i1_n * blk_n1 + i2_n * blk_n2]) + + @T.prim_func + def fp8_index_kernel_3( + q: T.Tensor[(b, m, h, d), FP8], + q_s: T.Tensor[(b, m, h), FP32], + 
k: T.Tensor[(b, n, d), FP8], + k_s: T.Tensor[(b, n), FP32], + o: T.Tensor[(b, m, n), FP32], + ) -> None: + with T.Kernel(b, T.ceildiv(n, blk_n1), T.ceildiv(m, 8), threads=threads) as ( + i_b, i1_n, i_m_block + ): + if disable_buffer_ops: + T.disable_buffer_ops(o) + m_start = i_m_block * 8 + q_smem0 = T.alloc_shared((h, d), FP8) + q_smem1 = T.alloc_shared((h, d), FP8) + q_smem2 = T.alloc_shared((h, d), FP8) + q_smem3 = T.alloc_shared((h, d), FP8) + k_smem = T.alloc_shared((blk_n2, d), FP8) + T.annotate_layout({ + q_smem0: tilelang.layout.make_hcu_swizzled_layout(q_smem0, major_pack=1), + q_smem1: tilelang.layout.make_hcu_swizzled_layout(q_smem1, major_pack=1), + q_smem2: tilelang.layout.make_hcu_swizzled_layout(q_smem2, major_pack=1), + q_smem3: tilelang.layout.make_hcu_swizzled_layout(q_smem3, major_pack=1), + k_smem: tilelang.layout.make_hcu_swizzled_layout(k_smem, major_pack=1), + }) + q_pre_frag0 = T.alloc_fragment((h, d), FP8) + q_pre_frag1 = T.alloc_fragment((h, d), FP8) + q_pre_frag2 = T.alloc_fragment((h, d), FP8) + q_pre_frag3 = T.alloc_fragment((h, d), FP8) + q_pre_frag4 = T.alloc_fragment((h, d), FP8) + q_pre_frag5 = T.alloc_fragment((h, d), FP8) + q_pre_frag6 = T.alloc_fragment((h, d), FP8) + q_pre_frag7 = T.alloc_fragment((h, d), FP8) + q_frag0 = T.alloc_fragment((h, d), FP8) + q_frag1 = T.alloc_fragment((h, d), FP8) + q_frag2 = T.alloc_fragment((h, d), FP8) + q_frag3 = T.alloc_fragment((h, d), FP8) + q_frag4 = T.alloc_fragment((h, d), FP8) + q_frag5 = T.alloc_fragment((h, d), FP8) + q_frag6 = T.alloc_fragment((h, d), FP8) + q_frag7 = T.alloc_fragment((h, d), FP8) + q_s_frag0 = T.alloc_fragment(h, FP32) + q_s_frag1 = T.alloc_fragment(h, FP32) + q_s_frag2 = T.alloc_fragment(h, FP32) + q_s_frag3 = T.alloc_fragment(h, FP32) + q_s_frag4 = T.alloc_fragment(h, FP32) + q_s_frag5 = T.alloc_fragment(h, FP32) + q_s_frag6 = T.alloc_fragment(h, FP32) + q_s_frag7 = T.alloc_fragment(h, FP32) + k_frag = T.alloc_fragment((blk_n2, d), FP8) + k_s_frag = 
T.alloc_fragment(blk_n2, FP32) + logits0 = T.alloc_fragment((blk_n2, h), FP32) + logits1 = T.alloc_fragment((blk_n2, h), FP32) + logits2 = T.alloc_fragment((blk_n2, h), FP32) + logits3 = T.alloc_fragment((blk_n2, h), FP32) + logits4 = T.alloc_fragment((blk_n2, h), FP32) + logits5 = T.alloc_fragment((blk_n2, h), FP32) + logits6 = T.alloc_fragment((blk_n2, h), FP32) + logits7 = T.alloc_fragment((blk_n2, h), FP32) + logits_sum0 = T.alloc_fragment(blk_n2, FP32) + logits_sum1 = T.alloc_fragment(blk_n2, FP32) + logits_sum2 = T.alloc_fragment(blk_n2, FP32) + logits_sum3 = T.alloc_fragment(blk_n2, FP32) + logits_sum4 = T.alloc_fragment(blk_n2, FP32) + logits_sum5 = T.alloc_fragment(blk_n2, FP32) + logits_sum6 = T.alloc_fragment(blk_n2, FP32) + logits_sum7 = T.alloc_fragment(blk_n2, FP32) + + T.copy(q[i_b, m_start, 0, 0], q_pre_frag0) + T.copy(q[i_b, m_start + 1, 0, 0], q_pre_frag1) + T.copy(q[i_b, m_start + 2, 0, 0], q_pre_frag2) + T.copy(q[i_b, m_start + 3, 0, 0], q_pre_frag3) + T.copy(q[i_b, m_start + 4, 0, 0], q_pre_frag4) + T.copy(q[i_b, m_start + 5, 0, 0], q_pre_frag5) + T.copy(q[i_b, m_start + 6, 0, 0], q_pre_frag6) + T.copy(q[i_b, m_start + 7, 0, 0], q_pre_frag7) + T.copy(q_s[i_b, m_start, 0], q_s_frag0) + T.copy(q_s[i_b, m_start + 1, 0], q_s_frag1) + T.copy(q_s[i_b, m_start + 2, 0], q_s_frag2) + T.copy(q_s[i_b, m_start + 3, 0], q_s_frag3) + T.copy(q_s[i_b, m_start + 4, 0], q_s_frag4) + T.copy(q_s[i_b, m_start + 5, 0], q_s_frag5) + T.copy(q_s[i_b, m_start + 6, 0], q_s_frag6) + T.copy(q_s[i_b, m_start + 7, 0], q_s_frag7) + + T.copy(q_pre_frag0, q_smem0) + T.copy(q_pre_frag1, q_smem1) + T.copy(q_pre_frag2, q_smem2) + T.copy(q_pre_frag3, q_smem3) + T.copy(q_smem0, q_frag0) + T.copy(q_smem1, q_frag1) + T.copy(q_smem2, q_frag2) + T.copy(q_smem3, q_frag3) + T.copy(q_pre_frag4, q_smem0) + T.copy(q_pre_frag5, q_smem1) + T.copy(q_pre_frag6, q_smem2) + T.copy(q_pre_frag7, q_smem3) + T.copy(q_smem0, q_frag4) + T.copy(q_smem1, q_frag5) + T.copy(q_smem2, q_frag6) + 
T.copy(q_smem3, q_frag7) + + for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=0): + T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem) + T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag) + T.clear(logits0) + T.clear(logits1) + T.clear(logits2) + T.clear(logits3) + T.clear(logits4) + T.clear(logits5) + T.clear(logits6) + T.clear(logits7) + T.copy(k_smem, k_frag) + T.gemm(k_frag, q_frag0, logits0, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag1, logits1, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag2, logits2, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag3, logits3, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag4, logits4, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag5, logits5, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag6, logits6, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + T.gemm(k_frag, q_frag7, logits7, transpose_A=False, transpose_B=True, k_pack=1, policy=gemm_policy) + for i_h, i3_n in T.Parallel(h, blk_n2): + logits0[i3_n, i_h] = T.max(logits0[i3_n, i_h], 0) * q_s_frag0[i_h] + logits1[i3_n, i_h] = T.max(logits1[i3_n, i_h], 0) * q_s_frag1[i_h] + logits2[i3_n, i_h] = T.max(logits2[i3_n, i_h], 0) * q_s_frag2[i_h] + logits3[i3_n, i_h] = T.max(logits3[i3_n, i_h], 0) * q_s_frag3[i_h] + logits4[i3_n, i_h] = T.max(logits4[i3_n, i_h], 0) * q_s_frag4[i_h] + logits5[i3_n, i_h] = T.max(logits5[i3_n, i_h], 0) * q_s_frag5[i_h] + logits6[i3_n, i_h] = T.max(logits6[i3_n, i_h], 0) * q_s_frag6[i_h] + logits7[i3_n, i_h] = T.max(logits7[i3_n, i_h], 0) * q_s_frag7[i_h] + + T.reduce_sum(logits0, logits_sum0, dim=1) + T.reduce_sum(logits1, logits_sum1, dim=1) + T.reduce_sum(logits2, logits_sum2, dim=1) + T.reduce_sum(logits3, logits_sum3, dim=1) + T.reduce_sum(logits4, 
logits_sum4, dim=1) + T.reduce_sum(logits5, logits_sum5, dim=1) + T.reduce_sum(logits6, logits_sum6, dim=1) + T.reduce_sum(logits7, logits_sum7, dim=1) + for i3_n in T.Parallel(blk_n2): + logits_sum0[i3_n] *= k_s_frag[i3_n] + logits_sum1[i3_n] *= k_s_frag[i3_n] + logits_sum2[i3_n] *= k_s_frag[i3_n] + logits_sum3[i3_n] *= k_s_frag[i3_n] + logits_sum4[i3_n] *= k_s_frag[i3_n] + logits_sum5[i3_n] *= k_s_frag[i3_n] + logits_sum6[i3_n] *= k_s_frag[i3_n] + logits_sum7[i3_n] *= k_s_frag[i3_n] + + T.copy(logits_sum0, o[i_b, m_start, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum1, o[i_b, m_start + 1, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum2, o[i_b, m_start + 2, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum3, o[i_b, m_start + 3, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum4, o[i_b, m_start + 4, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum5, o[i_b, m_start + 5, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum6, o[i_b, m_start + 6, i1_n * blk_n1 + i2_n * blk_n2]) + T.copy(logits_sum7, o[i_b, m_start + 7, i1_n * blk_n1 + i2_n * blk_n2]) + + if m_split == 1: + return fp8_index_kernel_ + elif m_split == 2: + return fp8_index_kernel_1 + elif m_split == 4: + return fp8_index_kernel_2 + else: + return fp8_index_kernel_3 + + +@functools.lru_cache(maxsize=64) +def _get_config_module(h: int, d: int, cu_count: int): + """Get config module for (h, d, cu_count). Returns None if not found.""" + config_module_name = f"fp8_index_tuned_config_h{h}_d{d}_cu{cu_count}" + try: + return __import__(f"aiter.ops.tilelang.configs.fp8_index.{config_module_name}", fromlist=["get_tuned_config"]) + except (ImportError, AttributeError): + return None + + +@functools.lru_cache(maxsize=128) +def _get_fp8_index_kernel( + h: int, + d: int, + m_split: int, + blk_n1: int, + blk_n2: int, + disable_buffer_ops: bool = False, + threads: int = 256, + clear_accum: bool = True, +): + """Cached kernel creation. 
Dispatches to fp8_index_kernel_ / _1 / _2 based on m_split.""" + assert m_split in (1, 2, 4, 8), "m_split must be 1, 2, 4, or 8" + print( + f"[fp8_index] kernel config: h={h} d={d} m_split={m_split} blk_n1={blk_n1} blk_n2={blk_n2} " + f"threads={threads} clear_accum={clear_accum} disable_buffer_ops={disable_buffer_ops}" + ) + return fp8_index_kernel( + h, d, m_split, blk_n1, blk_n2, disable_buffer_ops, threads=threads, clear_accum=clear_accum + ) + + +def fp8_index( + q: torch.Tensor, + q_s: torch.Tensor, + k: torch.Tensor, + k_s: torch.Tensor, +) -> torch.Tensor: + """ + Perform index score using FP8 precision. + + Args: + q (torch.Tensor): The Q tensor, must be contiguous. + q_s (torch.Tensor): The scaling factor for Q (float), must be contiguous. + k (torch.Tensor): The K tensor, must be contiguous. + k_s (torch.Tensor): The scaling factor for K (e8m0 here), must be contiguous. + + fp8 q @ fp8 k -> fp32 logits + relu(fp32 logits) * q_s (weights) -> fp32 logits + fp32 logits -> fp32 logits_sum + fp32 logits_sum * k_s (e8m0) -> fp32 index_score + """ + b, m, h, d = q.shape + n = k.shape[1] + + # Use tuned config; fallback to default if not found (run tune_fp8_index.py to generate) + mod = _get_config_module(h, d, cu_count) + if mod is not None: + m_split, blk_n1, blk_n2 = mod.get_tuned_config(m, n) + else: + m_split, blk_n1, blk_n2 = 1, 512, 128 + + disable_buffer_ops = False + if b * m * n * 4 >= 4294967296: + disable_buffer_ops = True + + kernel = _get_fp8_index_kernel( + h, d, m_split, blk_n1, blk_n2, disable_buffer_ops, threads=256, clear_accum=False + ) + return kernel(q, q_s, k, k_s) diff --git a/aiter/ops/tilelang/sparse_mla_fwd.py b/aiter/ops/tilelang/sparse_mla_fwd.py new file mode 100644 index 0000000000000000000000000000000000000000..f54a51006b4ab5b78b0058835676335e67523d09 --- /dev/null +++ b/aiter/ops/tilelang/sparse_mla_fwd.py @@ -0,0 +1,1546 @@ +# ruff: noqa +import torch +import tilelang +from tilelang import language as T +# from utils import 
assert_tensors_similar +import functools +import math +from aiter import logger + +cu_count = torch.cuda.get_device_properties("cuda").multi_processor_count +# def get_configs(): +# import itertools +# block_I = [16, 32] +# threads = [128, 256] +# num_split = [1, 2, 4, 8, 16] +# num_stages = [1] + +# _configs = list(itertools.product(block_I, threads, num_split, num_stages)) + +# return [{ +# "block_I": c[0], +# "threads": c[1], +# "num_split": c[2], +# "num_stages": c[3], +# } for c in _configs] + +# @tilelang.autotune(configs=get_configs()) +config_map_cu72 = { + 1: { + "block_I": 32, + "threads": 256, + "num_split": 32, + "num_stages": 0, + "batch_head": 1, + "num_split_tail": 0, + }, + 2: { + "block_I": 32, + "threads": 256, + "num_split": 32, + "num_stages": 0, + "batch_head": 2, + "num_split_tail": 0, + }, + 3: { + "block_I": 32, + "threads": 256, + "num_split": 32, + "num_stages": 0, + "batch_head": 3, + "num_split_tail": 0, + }, + 4: { + "block_I": 32, + "threads": 256, + "num_split": 32, + "num_stages": 0, + "batch_head": 4, + "num_split_tail": 0, + }, + 8: { + "block_I": 32, + "threads": 256, + "num_split": 16, + "num_stages": 0, + "batch_head": 8, + "num_split_tail": 0, + }, + 16: { + "block_I": 32, + "threads": 256, + "num_split": 8, + "num_stages": 0, + "batch_head": 16, + "num_split_tail": 0, + }, + 32: { + "block_I": 32, + "threads": 256, + "num_split": 4, + "num_stages": 0, + "batch_head": 32, + "num_split_tail": 0, + }, + 64: { + "block_I": 32, + "threads": 256, + "num_split": 2, + "num_stages": 0, + "batch_head": 64, + "num_split_tail": 0, + }, + 128: { + "block_I": 32, + "threads": 256, + "num_split": 1, + "num_stages": 0, + "batch_head": 128, + "num_split_tail": 0, + } +} + +@tilelang.jit( + # if we set output idx, it will cuase error when create output tensor in cython wrapper when cuda_graph is used + # out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: 
True, + tilelang.PassConfigKey.TL_DISABLE_SAFE_MEMORY_ACCESS: True, + tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True, + tilelang.PassConfigKey.TL_DISABLE_DATA_RACE_CHECK: True, + }, +) +def sparse_mla_fwd( + num_heads, + dim, + tail_dim, + topk, + num_split=1, + num_split_tail=0, + *, + kv_group=1, + sm_scale=None, + is_causal=True, + block_I=32, + num_stages=1, + threads=256, + kv_stride=1, + dtype="bfloat16", +): + assert dim == tilelang.math.next_power_of_2( + dim + ), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2( + tail_dim + ), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert ( + topk % block_I == 0 + ), "otherwise will load some index=0 thus causing wrong kv to be loaded" + if num_split > 1: + assert ( + topk % (num_split * block_I) == 0 + ), f"topk={topk} must be divisible by num_split * block_I={num_split} * {block_I}" + if num_split_tail > 0: + assert ( + topk % (num_split_tail * block_I) == 0 + ), f"topk={topk} must be divisible by num_split_tail * block_I={num_split_tail} * {block_I}" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) + else: + sm_scale = sm_scale * 1.44269504 # log2(e) + + dim_plus_tail = dim + tail_dim + head_kv = num_heads // kv_group + + # Symbolic variables + batch = T.symbolic("batch") + batch_head = T.symbolic("batch_head") + batch_tail = T.symbolic("batch_tail") + seq_len = T.symbolic("seq_len") + seq_len_kv = T.symbolic("seq_len_kv") + + q_shape = [batch, seq_len, num_heads, dim_plus_tail] + kv_shape = [batch, seq_len_kv, kv_group, dim_plus_tail] + o_shape = [batch, seq_len, num_heads, dim] + indices_shape = [batch, seq_len, kv_group, topk] + + glse_shape = [batch_head, seq_len, num_split, num_heads] + output_partial_shape = [batch_head, seq_len, num_split, num_heads, dim] + if num_split_tail > 0: + glse_shape_tail = [batch_tail, 
seq_len, num_split_tail, num_heads] + output_partial_shape_tail = [batch_tail, seq_len, num_split_tail, num_heads, dim] + + indices_dtype = "int32" + accum_dtype = "float" + intermediate_dtype = "float16" + # intermediate_dtype = dtype + + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1 + BI = block_I + + # e.g. num_split = 2, we take like BI_split0 BI_split1 BI_split0 BI_split1 ... on topk + topk_per_split = topk // num_split if num_split > 1 else topk + NI = tilelang.cdiv(topk_per_split, BI) + split_stride = BI * num_split + + topk_per_split_tail = topk // num_split_tail if num_split_tail > 1 else topk + NI_tail = tilelang.cdiv(topk_per_split_tail, BI) + split_stride_tail = BI * num_split_tail + + D = dim + D_spilt = dim // 4 + D_tail = tail_dim + + # Optimized: simplify max_block_m calculation + max_block_m = 32 if head_kv == 128 else 16 + + if head_kv > max_block_m: + assert head_kv % max_block_m == 0, f"head_kv should be a multiple of {max_block_m}" + REPLICATE_H = head_kv // max_block_m + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else max_block_m + + hd_div_threads = (H_per_block * D_spilt) // threads + bid_div_threads = (BI * D_spilt) // threads + kv_vectorized = max(min(min(hd_div_threads, bid_div_threads), 8), 1) + # if BI is 16, can not kpack for gemm2 + if BI < 32 and kv_vectorized == 8: + kv_vectorized = 4 + + threads_per_line = D_spilt // kv_vectorized + warps_line_stride = threads // threads_per_line + kv_serial_count = BI // warps_line_stride + kpack = min((kv_vectorized + 3) // 4, 2) + + mmac_k = 16 * kpack + warps = threads // 64 + max_warp_k = D_spilt // mmac_k + max_warp_n = BI // 16 + max_warp_m = H_per_block // 16 + out_shared_reuse = H_per_block > 16 + + gemm1_policy = T.GemmWarpPolicy.FullColK + gemm2_policy = T.GemmWarpPolicy.FullCol + + # print(f"kv_serial_count={kv_serial_count}, warps_line_stride={warps_line_stride}, " + # 
f"threads_per_line={threads_per_line}, kv_vectorized={kv_vectorized}, kpack={kpack}, " + # f"block_M={H_per_block}, block_I={BI}, gemm1_policy={gemm1_policy}, gemm2_policy={gemm2_policy}") + + @T.macro + def sparse_mla( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, batch, kv_group, threads=threads) as ( + bx, + by, + bz, + ): + Q_spilt0_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt1_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt2_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt3_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_tail_shared = T.alloc_fragment([H_per_block, D_tail], dtype) + + KV_spilt0_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt1_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt2_shared = T.alloc_shared([BI, D_spilt], dtype) + # KV_spilt3_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt0_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_spilt1_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_spilt2_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_spilt3_local = T.alloc_fragment([BI, D_spilt], dtype) + + gemm2_kv_split0_local = T.alloc_fragment([BI, D_spilt], dtype) + K_tail_local = T.alloc_fragment([BI, D_tail], dtype) + + mask = T.alloc_fragment([BI], "bool") + + acc_o0 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o1 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o2 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o3 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + S_shared = T.alloc_shared([H_per_block, BI], dtype) + sumexp = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i = T.alloc_fragment([H_per_block], 
accum_dtype) + alpha = T.alloc_fragment([H_per_block], accum_dtype) + m_i = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) + indices_local = T.alloc_fragment([1], indices_dtype) + indices_mask = T.alloc_fragment([BI], indices_dtype) + indices_local_1 = T.alloc_fragment([1], indices_dtype) + indices_tail = T.alloc_fragment([1], indices_dtype) + valid_NI = T.alloc_fragment([1], "int") + + b_i, g_i = by, bz + s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + # q_i = q_start_index_s[0] + s_i + # max_kv_i = (q_i + 1 - kv_stride) // kv_stride + # kv_i = (q_i + 1 - kv_stride) // kv_stride + # max_kv_i = kv_i if (kv_i <= seq_len_kv - 1) else seq_len_kv - 1 + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * max_block_m) + H1 = H0 + H_per_block + + tx = T.get_thread_binding() + T.copy(Q[b_i, s_i, H0:H1, :D_spilt], Q_spilt0_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, D_spilt:2*D_spilt], Q_spilt1_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, 2*D_spilt:3*D_spilt], Q_spilt2_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, 3*D_spilt:4*D_spilt], Q_spilt3_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) + + T.fill(acc_o0, 0) + T.fill(acc_o1, 0) + T.fill(acc_o2, 0) + T.fill(acc_o3, 0) + T.fill(sumexp, 1) + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan + T.fill(valid_NI, 0) + + for i_i in T.serial(NI): + first_indices = Indices[b_i, s_i, g_i, i_i * BI] + # if first_indices <= max_kv_i and first_indices >= 0: + if first_indices >= 0: + valid_NI[0] += 1 + + for i_i in T.Pipelined(valid_NI[0], num_stages=num_stages): + for bi_i in T.Parallel(BI): + indices_mask[bi_i] = Indices[b_i, s_i, g_i, i_i * BI + bi_i] + mask[bi_i] = indices_mask[bi_i] >= 0 + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else( + mask[bi_i], 0, -T.infinity(acc_s.dtype) + ) + + # for u in 
T.serial(kv_serial_count): + # line_stride = u * warps_line_stride + # indices_local[0] = Indices[b_i, s_i, g_i, i_i * BI + line_stride + tx // threads_per_line] + # # indices_local[0] = T.if_then_else(indices_local[0] <= max_kv_i and indices_local[0] >= 0, indices_local[0], 0) + # indices_local[0] = T.if_then_else(indices_local[0] >= 0, indices_local[0], 0) + # for v in T.vectorized(kv_vectorized): + # KV_spilt0_local[line_stride + tx // threads_per_line, + # (tx % threads_per_line) * kv_vectorized + v] = KV[b_i, indices_local[0], g_i, + # (tx % threads_per_line) * kv_vectorized + v] + # KV_spilt1_local[line_stride + tx // threads_per_line, + # (tx % threads_per_line) * kv_vectorized + v] = KV[b_i, indices_local[0], g_i, + # D_spilt + (tx % threads_per_line) * kv_vectorized + v] + # KV_spilt2_local[line_stride + tx // threads_per_line, + # (tx % threads_per_line) * kv_vectorized + v] = KV[b_i, indices_local[0], g_i, + # 2*D_spilt + (tx % threads_per_line) * kv_vectorized + v] + + for bi_i, d_i in T.Parallel(BI, D_spilt): + KV_spilt0_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, d_i], 0) + KV_spilt1_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, D_spilt + d_i], 0) + KV_spilt2_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, 2*D_spilt + d_i], 0) + for bi_i, d_i in T.Parallel(BI, D_spilt): + KV_spilt3_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, 3*D_spilt + d_i], 0) + + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, D + d_i], 0) + + T.copy(KV_spilt0_local, KV_spilt0_shared) + 
T.copy(KV_spilt1_local, KV_spilt1_shared) + T.copy(KV_spilt2_local, KV_spilt2_shared) + + T.gemm(Q_spilt0_shared, KV_spilt0_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt1_shared, KV_spilt1_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt2_shared, KV_spilt2_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt3_shared, KV_spilt3_local, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_tail_shared, K_tail_local, acc_s, transpose_B=True, policy=gemm1_policy) + + T.copy(m_i, m_i_prev) + if gemm1_policy == T.GemmWarpPolicy.FullColK: + T.reduce_sum_warp(acc_s, acc_s, clear=False) + + T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp2( + acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale + ) + T.reduce_sum(acc_s, sumexp_i, dim=1) + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i] + for h_i, d_i in T.Parallel(H_per_block, D_spilt): + acc_o0[h_i, d_i] = acc_o0[h_i, d_i] * alpha[h_i] + acc_o1[h_i, d_i] = acc_o1[h_i, d_i] * alpha[h_i] + acc_o2[h_i, d_i] = acc_o2[h_i, d_i] * alpha[h_i] + acc_o3[h_i, d_i] = acc_o3[h_i, d_i] * alpha[h_i] + + T.copy(KV_spilt0_shared, gemm2_kv_split0_local) + T.copy(acc_s, S_shared) + T.copy(KV_spilt3_local, KV_spilt0_shared) + T.gemm(S_shared, gemm2_kv_split0_local, acc_o0, k_pack=kpack, policy=gemm2_policy) + T.gemm(S_shared, KV_spilt1_shared, acc_o1, k_pack=kpack, policy=gemm2_policy) + T.gemm(S_shared, KV_spilt2_shared, acc_o2, k_pack=kpack, policy=gemm2_policy) + T.gemm(S_shared, KV_spilt0_shared, acc_o3, k_pack=kpack, policy=gemm2_policy) + # Rescale + for h_i, d_i in T.Parallel(H_per_block, D_spilt): + acc_o0[h_i, d_i] /= sumexp[h_i] + acc_o1[h_i, d_i] /= sumexp[h_i] + acc_o2[h_i, d_i] /= sumexp[h_i] + 
# Split-KV pass of the sparse MLA forward kernel.
#
# Grid: (seq_len * REPLICATE_H, batch_head * kv_group, num_split).  Each block
# handles one query position, one (batch, kv-group) pair and one split of the
# index list.  It writes
#   * Output_partial[b, s, split, H0:H1, :4*D_spilt] -- the softmax-normalised
#     partial output of this split, and
#   * glse[b, s, split, H0:H1] -- the base-2 log-sum-exp of the scaled scores,
# which `combine` later merges across splits.
#
# Entries of `Indices` that are negative are treated as padding: their score is
# forced to -inf and their KV row is replaced by zeros.
@T.macro
def sparse_mla_split(
    Q: T.Tensor(q_shape, dtype),  # type: ignore
    KV: T.Tensor(kv_shape, dtype),  # type: ignore
    Indices: T.Tensor(indices_shape, indices_dtype),  # type: ignore
    glse: T.Tensor(glse_shape, intermediate_dtype),  # type: ignore
    Output_partial: T.Tensor(output_partial_shape, intermediate_dtype),  # type: ignore
):
    with T.Kernel(seq_len * REPLICATE_H, batch_head * kv_group, num_split, threads=threads) as (
        bx,
        by,
        bz,
    ):
        split_idx = bz
        b_i = by if kv_group == 1 else (by // kv_group)
        g_i = 0 if kv_group == 1 else (by % kv_group)
        s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H)

        # Head rows [H0, H1) processed by this block.
        H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * max_block_m)
        H1 = H0 + H_per_block

        # Count the index tiles of this split whose first entry is non-negative.
        # The main loop below visits tiles 0 .. valid_NI-1 only.
        # NOTE(review): this relies on valid tiles forming a prefix of the
        # split -- confirm against the producer of `Indices`.
        valid_NI = T.alloc_fragment([1], "int")
        T.fill(valid_NI, 0)
        for i_i in T.serial(NI):
            first_indices = Indices[b_i, s_i, g_i, i_i * split_stride + split_idx * BI]
            if first_indices >= 0:
                valid_NI[0] += 1

        if valid_NI[0] == 0:
            # Empty split: publish LSE = -inf and a zero partial output.
            acc_o = T.alloc_fragment([H_per_block, D], accum_dtype)
            sumexp = T.alloc_fragment([H_per_block], accum_dtype)
            T.fill(sumexp, -T.infinity(accum_dtype))
            T.fill(acc_o, 0)
            T.copy(sumexp, glse[b_i, s_i, split_idx, H0:H1])
            T.copy(acc_o, Output_partial[b_i, s_i, split_idx, H0:H1, :D])
        else:
            # Q is held as four D_spilt-wide column slices plus a D_tail-wide
            # tail.  Despite the "_shared" suffix these are fragment buffers.
            Q_spilt0_shared = T.alloc_fragment([H_per_block, D_spilt], dtype)
            Q_spilt1_shared = T.alloc_fragment([H_per_block, D_spilt], dtype)
            Q_spilt2_shared = T.alloc_fragment([H_per_block, D_spilt], dtype)
            Q_spilt3_shared = T.alloc_fragment([H_per_block, D_spilt], dtype)
            Q_tail_shared = T.alloc_fragment([H_per_block, D_tail], dtype)

            # Only three shared KV tiles exist; slice 3 stays in a fragment for
            # the first GEMM and later reuses KV_spilt0_shared for the second.
            KV_spilt0_shared = T.alloc_shared([BI, D_spilt], dtype)
            KV_spilt1_shared = T.alloc_shared([BI, D_spilt], dtype)
            KV_spilt2_shared = T.alloc_shared([BI, D_spilt], dtype)
            KV_spilt0_local = T.alloc_fragment([BI, D_spilt], dtype)
            KV_spilt1_local = T.alloc_fragment([BI, D_spilt], dtype)
            KV_spilt2_local = T.alloc_fragment([BI, D_spilt], dtype)
            KV_spilt3_local = T.alloc_fragment([BI, D_spilt], dtype)

            gemm2_kv_split0_local = T.alloc_fragment([BI, D_spilt], dtype)
            K_tail_local = T.alloc_fragment([BI, D_tail], dtype)

            mask = T.alloc_fragment([BI], "bool")
            acc_o0 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype)
            acc_o1 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype)
            acc_o2 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype)
            acc_o3 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype)

            acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype)
            S_shared = T.alloc_shared([H_per_block, BI], dtype)
            sumexp = T.alloc_fragment([H_per_block], accum_dtype)
            sumexp_i = T.alloc_fragment([H_per_block], accum_dtype)
            alpha = T.alloc_fragment([H_per_block], accum_dtype)
            m_i = T.alloc_fragment([H_per_block], accum_dtype)
            m_i_prev = T.alloc_fragment([H_per_block], accum_dtype)
            indices_mask = T.alloc_fragment([BI], indices_dtype)

            T.copy(Q[b_i, s_i, H0:H1, :D_spilt], Q_spilt0_shared, coalesced_width=kv_vectorized)
            T.copy(Q[b_i, s_i, H0:H1, D_spilt:2*D_spilt], Q_spilt1_shared, coalesced_width=kv_vectorized)
            T.copy(Q[b_i, s_i, H0:H1, 2*D_spilt:3*D_spilt], Q_spilt2_shared, coalesced_width=kv_vectorized)
            T.copy(Q[b_i, s_i, H0:H1, 3*D_spilt:4*D_spilt], Q_spilt3_shared, coalesced_width=kv_vectorized)
            T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared)

            T.fill(acc_o0, 0)
            T.fill(acc_o1, 0)
            T.fill(acc_o2, 0)
            T.fill(acc_o3, 0)
            T.fill(sumexp, 1)
            # A large finite value instead of -inf, so (m_prev - m) never
            # evaluates (-inf) - (-inf) = NaN.
            T.fill(m_i, -(2**30))

            for i_i in T.Pipelined(valid_NI[0], num_stages=num_stages):
                idx_in_split = i_i * split_stride + split_idx * BI
                for bi_i in T.Parallel(BI):
                    indices_mask[bi_i] = Indices[b_i, s_i, g_i, idx_in_split + bi_i]
                    mask[bi_i] = indices_mask[bi_i] >= 0

                # Seed the score tile: 0 for valid columns, -inf for padding.
                for h_i, bi_i in T.Parallel(H_per_block, BI):
                    acc_s[h_i, bi_i] = T.if_then_else(
                        mask[bi_i], 0, -T.infinity(acc_s.dtype)
                    )

                # Gather the selected KV rows; padding rows are zero-filled.
                for bi_i, d_i in T.Parallel(BI, D_spilt):
                    KV_spilt0_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0,
                                                    KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, d_i], 0)
                    KV_spilt1_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0,
                                                    KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, D_spilt + d_i], 0)
                    KV_spilt2_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0,
                                                    KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, 2*D_spilt + d_i], 0)
                for bi_i, d_i in T.Parallel(BI, D_spilt):
                    KV_spilt3_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0,
                                                    KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, 3*D_spilt + d_i], 0)

                for bi_i, d_i in T.Parallel(BI, D_tail):
                    K_tail_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0,
                                                    KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, D + d_i], 0)

                T.copy(KV_spilt0_local, KV_spilt0_shared)
                T.copy(KV_spilt1_local, KV_spilt1_shared)
                T.copy(KV_spilt2_local, KV_spilt2_shared)

                # GEMM 1: acc_s += Q_slice @ KV_slice^T, accumulated over all slices.
                T.gemm(Q_spilt0_shared, KV_spilt0_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy)
                T.gemm(Q_spilt1_shared, KV_spilt1_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy)
                T.gemm(Q_spilt2_shared, KV_spilt2_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy)
                T.gemm(Q_spilt3_shared, KV_spilt3_local, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy)
                T.gemm(Q_tail_shared, K_tail_local, acc_s, transpose_B=True, policy=gemm1_policy)

                T.copy(m_i, m_i_prev)
                if gemm1_policy == T.GemmWarpPolicy.FullColK:
                    T.reduce_sum_warp(acc_s, acc_s, clear=False)
                T.reduce_max(acc_s, m_i, dim=1, clear=False)

                # Running-max softmax update in base 2: alpha rescales what was
                # accumulated under the previous row maximum.
                for h_i in T.Parallel(H_per_block):
                    alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
                for h_i, bi_i in T.Parallel(H_per_block, BI):
                    acc_s[h_i, bi_i] = T.exp2(
                        acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale
                    )
                T.reduce_sum(acc_s, sumexp_i, dim=1)
                for h_i in T.Parallel(H_per_block):
                    sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i]
                for h_i, d_i in T.Parallel(H_per_block, D_spilt):
                    acc_o0[h_i, d_i] = acc_o0[h_i, d_i] * alpha[h_i]
                    acc_o1[h_i, d_i] = acc_o1[h_i, d_i] * alpha[h_i]
                    acc_o2[h_i, d_i] = acc_o2[h_i, d_i] * alpha[h_i]
                    acc_o3[h_i, d_i] = acc_o3[h_i, d_i] * alpha[h_i]

                # GEMM 2: acc_o{k} += P @ KV_slice{k}.  Slice 0 is first moved
                # to a fragment so KV_spilt0_shared can be overwritten with
                # slice 3.
                T.copy(KV_spilt0_shared, gemm2_kv_split0_local)
                T.copy(acc_s, S_shared)
                T.copy(KV_spilt3_local, KV_spilt0_shared)
                T.gemm(S_shared, gemm2_kv_split0_local, acc_o0, k_pack=kpack, policy=gemm2_policy)
                T.gemm(S_shared, KV_spilt1_shared, acc_o1, k_pack=kpack, policy=gemm2_policy)
                T.gemm(S_shared, KV_spilt2_shared, acc_o2, k_pack=kpack, policy=gemm2_policy)
                T.gemm(S_shared, KV_spilt0_shared, acc_o3, k_pack=kpack, policy=gemm2_policy)

            # Normalise by the softmax denominator, then turn the denominator
            # into a base-2 log-sum-exp for the combine step.
            for h_i, d_i in T.Parallel(H_per_block, D_spilt):
                acc_o0[h_i, d_i] /= sumexp[h_i]
                acc_o1[h_i, d_i] /= sumexp[h_i]
                acc_o2[h_i, d_i] /= sumexp[h_i]
                acc_o3[h_i, d_i] /= sumexp[h_i]
            for h_i in T.Parallel(H_per_block):
                sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale

            if out_shared_reuse:
                # Two staging tiles, used twice: slices 0/1 first, then 2/3.
                acc_oshared0 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype)
                acc_oshared1 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype)
                T.annotate_layout({
                    acc_oshared0: tilelang.layout.make_hcu_swizzled_layout(acc_oshared0, major_pack=2),
                    acc_oshared1: tilelang.layout.make_hcu_swizzled_layout(acc_oshared1, major_pack=2),
                })
                T.copy(acc_o0, acc_oshared0)
                T.copy(acc_o1, acc_oshared1)
                T.copy(acc_oshared0, Output_partial[b_i, s_i, split_idx, H0:H1, :D_spilt])
                T.copy(acc_oshared1, Output_partial[b_i, s_i, split_idx, H0:H1, D_spilt:2*D_spilt])
                T.copy(acc_o2, acc_oshared0)
                T.copy(acc_o3, acc_oshared1)
                T.copy(acc_oshared0, Output_partial[b_i, s_i, split_idx, H0:H1, 2*D_spilt:3*D_spilt])
                T.copy(acc_oshared1, Output_partial[b_i, s_i, split_idx, H0:H1, 3*D_spilt:4*D_spilt])
            else:
                # One staging tile per output slice.
                acc_oshared0 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype)
                acc_oshared1 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype)
                acc_oshared2 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype)
                acc_oshared3 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype)
                T.annotate_layout({
                    acc_oshared0: tilelang.layout.make_hcu_swizzled_layout(acc_oshared0, major_pack=2),
                    acc_oshared1: tilelang.layout.make_hcu_swizzled_layout(acc_oshared1, major_pack=2),
                    acc_oshared2: tilelang.layout.make_hcu_swizzled_layout(acc_oshared2, major_pack=2),
                    acc_oshared3: tilelang.layout.make_hcu_swizzled_layout(acc_oshared3, major_pack=2),
                })
                T.copy(acc_o0, acc_oshared0)
                T.copy(acc_o1, acc_oshared1)
                T.copy(acc_o2, acc_oshared2)
                T.copy(acc_o3, acc_oshared3)
                T.copy(acc_oshared0, Output_partial[b_i, s_i, split_idx, H0:H1, :D_spilt])
                T.copy(acc_oshared1, Output_partial[b_i, s_i, split_idx, H0:H1, D_spilt:2*D_spilt])
                T.copy(acc_oshared2, Output_partial[b_i, s_i, split_idx, H0:H1, 2*D_spilt:3*D_spilt])
                T.copy(acc_oshared3, Output_partial[b_i, s_i, split_idx, H0:H1, 3*D_spilt:4*D_spilt])

            T.copy(sumexp, glse[b_i, s_i, split_idx, H0:H1])


# Merge the per-split partial outputs of `sparse_mla_split`.
#
# Grid: (seq_len, num_heads, batch_head); one block per output row of length
# `dim`.  With L_k = glse[..., k, head] the block computes
#   L = log2(sum_k 2^(L_k - max_k L_k)) + max_k L_k
#   Output = sum_k Output_partial_k * 2^(L_k - L)
@T.macro
def combine(
    glse: T.Tensor(glse_shape, intermediate_dtype),  # type: ignore
    Output_partial: T.Tensor(output_partial_shape, intermediate_dtype),  # type: ignore
    Output: T.Tensor(o_shape, dtype),  # type: ignore
):
    with T.Kernel(seq_len, num_heads, batch_head, threads=128) as (bx, by, bz):
        po_local = T.alloc_fragment([dim], intermediate_dtype)
        o_accum_local = T.alloc_fragment([dim], accum_dtype)
        lse_local_split = T.alloc_local([num_split], accum_dtype)
        lse_logsum_local = T.alloc_local([1], accum_dtype)
        lse_max_local = T.alloc_local([1], accum_dtype)
        scale_local = T.alloc_local([1], accum_dtype)

        T.clear(lse_logsum_local)
        T.clear(o_accum_local)
        lse_max_local[0] = -T.infinity(accum_dtype)
        # Cache every split's LSE once and track the maximum.
        for k in T.serial(num_split):
            lse_local_split[k] = glse[bz, bx, k, by]
            lse_max_local[0] = T.max(lse_max_local[0], lse_local_split[k])

        for k in T.Pipelined(num_split, num_stages=0):
            lse_logsum_local[0] += T.exp2(lse_local_split[k] - lse_max_local[0])
        lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0]
        for k in T.serial(num_split):
            for i in T.Parallel(dim):
                po_local[i] = Output_partial[bz, bx, k, by, i]
            # An empty split publishes LSE = -inf and must contribute nothing.
            # When *every* split is empty the global LSE is NaN
            # ((-inf) - (-inf)), so the weight is forced to 0 explicitly; the
            # row then comes out as zeros instead of NaN.
            scale_local[0] = T.if_then_else(
                lse_local_split[k] == -T.infinity(accum_dtype),
                0,
                T.exp2(lse_local_split[k] - lse_logsum_local[0]),
            )
            for i in T.Parallel(dim):
                o_accum_local[i] += po_local[i] * scale_local[0]
        for i in T.Parallel(dim):
            Output[bz, bx, by, i] = o_accum_local[i]


# Entry point for the split-KV path: partial attention per split, then merge.
@T.prim_func
def main_split(
    Q: T.Tensor(q_shape, dtype),  # type: ignore
    KV: T.Tensor(kv_shape, dtype),  # type: ignore
    Indices: T.Tensor(indices_shape, indices_dtype),  # type: ignore
    glse: T.Tensor(glse_shape, intermediate_dtype),  # type: ignore
    Output_partial: T.Tensor(output_partial_shape, intermediate_dtype),  # type: ignore
    Output: T.Tensor(o_shape, dtype),  # type: ignore
):
    sparse_mla_split(Q, KV, Indices, glse, Output_partial)
    combine(glse, Output_partial, Output)


# Entry point for the single-pass path.  `glse` and `Output_partial` are part
# of the signature but are not referenced by the body.
@T.prim_func
def main_no_split(
    Q: T.Tensor(q_shape, dtype),  # type: ignore
    KV: T.Tensor(kv_shape, dtype),  # type: ignore
    Indices: T.Tensor(indices_shape, indices_dtype),  # type: ignore
    glse: T.Tensor(glse_shape, intermediate_dtype),  # type: ignore
    Output_partial: T.Tensor(output_partial_shape, intermediate_dtype),  # type: ignore
    Output: T.Tensor(o_shape, dtype),  # type: ignore
):
    sparse_mla(Q, KV, Indices, Output)
Q_spilt1_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt2_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt3_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_tail_shared = T.alloc_fragment([H_per_block, D_tail], dtype) + + KV_spilt0_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt1_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt2_shared = T.alloc_shared([BI, D_spilt], dtype) + # KV_spilt3_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt0_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_spilt1_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_spilt2_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_spilt3_local = T.alloc_fragment([BI, D_spilt], dtype) + K_tail_local = T.alloc_fragment([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_o0 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o1 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o2 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o3 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + S_shared = T.alloc_shared([H_per_block, BI], dtype) + sumexp = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i = T.alloc_fragment([H_per_block], accum_dtype) + alpha = T.alloc_fragment([H_per_block], accum_dtype) + m_i = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) + indices_local = T.alloc_local([1], indices_dtype) + indices_mask = T.alloc_fragment([BI], indices_dtype) + indices_local_1 = T.alloc_fragment([1], indices_dtype) + + tx = T.get_thread_binding() + T.copy(Q[b_i, s_i, H0:H1, :D_spilt], Q_spilt0_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, D_spilt:2*D_spilt], Q_spilt1_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, 2*D_spilt:3*D_spilt], Q_spilt2_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, 
3*D_spilt:4*D_spilt], Q_spilt3_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) + + T.fill(acc_o0, 0) + T.fill(acc_o1, 0) + T.fill(acc_o2, 0) + T.fill(acc_o3, 0) + T.fill(sumexp, 1) + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan + + for i_i in T.Pipelined(valid_NI[0], num_stages=num_stages): + idx_in_split = i_i * split_stride_tail + split_idx * BI + for bi_i in T.Parallel(BI): + indices_mask[bi_i] = Indices[b_i, s_i, g_i, idx_in_split + bi_i] + # mask[bi_i] = indices_mask[bi_i] <= max_kv_i and indices_mask[bi_i] >= 0 + mask[bi_i] = indices_mask[bi_i] >= 0 + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else( + mask[bi_i], 0, -T.infinity(acc_s.dtype) + ) + + for bi_i, d_i in T.Parallel(BI, D_spilt): + KV_spilt0_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, d_i], 0) + KV_spilt1_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, D_spilt + d_i], 0) + KV_spilt2_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, 2*D_spilt + d_i], 0) + for bi_i, d_i in T.Parallel(BI, D_spilt): + KV_spilt3_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, 3*D_spilt + d_i], 0) + + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, idx_in_split + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, idx_in_split + bi_i], g_i, D + d_i], 0) + T.copy(KV_spilt0_local, KV_spilt0_shared) + T.copy(KV_spilt1_local, KV_spilt1_shared) + T.copy(KV_spilt2_local, KV_spilt2_shared) + + T.gemm(Q_spilt0_shared, KV_spilt0_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + 
T.gemm(Q_spilt1_shared, KV_spilt1_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt2_shared, KV_spilt2_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt3_shared, KV_spilt3_local, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_tail_shared, K_tail_local, acc_s, transpose_B=True, policy=gemm1_policy) + + T.copy(m_i, m_i_prev) + if gemm1_policy == T.GemmWarpPolicy.FullColK: + T.reduce_sum_warp(acc_s, acc_s, clear=False) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp2( + acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale + ) + T.reduce_sum(acc_s, sumexp_i, dim=1) + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i] + for h_i, d_i in T.Parallel(H_per_block, D_spilt): + acc_o0[h_i, d_i] = acc_o0[h_i, d_i] * alpha[h_i] + acc_o1[h_i, d_i] = acc_o1[h_i, d_i] * alpha[h_i] + acc_o2[h_i, d_i] = acc_o2[h_i, d_i] * alpha[h_i] + acc_o3[h_i, d_i] = acc_o3[h_i, d_i] * alpha[h_i] + + T.copy(acc_s, S_shared) + T.gemm(S_shared, KV_spilt0_shared, acc_o0, k_pack=kpack, policy=gemm2_policy) + T.copy(KV_spilt3_local, KV_spilt0_shared) + T.gemm(S_shared, KV_spilt1_shared, acc_o1, k_pack=kpack, policy=gemm2_policy) + T.gemm(S_shared, KV_spilt2_shared, acc_o2, k_pack=kpack, policy=gemm2_policy) + T.gemm(S_shared, KV_spilt0_shared, acc_o3, k_pack=kpack, policy=gemm2_policy) + + # Rescale + for h_i, d_i in T.Parallel(H_per_block, D_spilt): + acc_o0[h_i, d_i] /= sumexp[h_i] + acc_o1[h_i, d_i] /= sumexp[h_i] + acc_o2[h_i, d_i] /= sumexp[h_i] + acc_o3[h_i, d_i] /= sumexp[h_i] + + if out_shared_reuse: + acc_oshared0 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype) + acc_oshared1 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype) + T.annotate_layout({ + acc_oshared0: 
tilelang.layout.make_hcu_swizzled_layout(acc_oshared0, major_pack=2), + acc_oshared1: tilelang.layout.make_hcu_swizzled_layout(acc_oshared1, major_pack=2), + }) + T.copy(acc_o0, acc_oshared0) + T.copy(acc_o1, acc_oshared1) + T.copy(acc_oshared0, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, :D_spilt]) + T.copy(acc_oshared1, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, D_spilt:2*D_spilt]) + T.copy(acc_o2, acc_oshared0) + T.copy(acc_o3, acc_oshared1) + T.copy(acc_oshared0, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, 2*D_spilt:3*D_spilt]) + T.copy(acc_oshared1, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, 3*D_spilt:4*D_spilt]) + else: + acc_oshared0 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype) + acc_oshared1 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype) + acc_oshared2 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype) + acc_oshared3 = T.alloc_shared([H_per_block, D_spilt], intermediate_dtype) + T.annotate_layout({ + acc_oshared0: tilelang.layout.make_hcu_swizzled_layout(acc_oshared0, major_pack=2), + acc_oshared1: tilelang.layout.make_hcu_swizzled_layout(acc_oshared1, major_pack=2), + acc_oshared2: tilelang.layout.make_hcu_swizzled_layout(acc_oshared2, major_pack=2), + acc_oshared3: tilelang.layout.make_hcu_swizzled_layout(acc_oshared3, major_pack=2), + }) + T.copy(acc_o0, acc_oshared0) + T.copy(acc_o1, acc_oshared1) + T.copy(acc_o2, acc_oshared2) + T.copy(acc_o3, acc_oshared3) + T.copy(acc_oshared0, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, :D_spilt]) + T.copy(acc_oshared1, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, D_spilt:2*D_spilt]) + T.copy(acc_oshared2, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, 2*D_spilt:3*D_spilt]) + T.copy(acc_oshared3, Output_partial_tail[b_i_tail, s_i, split_idx, H0:H1, 3*D_spilt:4*D_spilt]) + + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale + T.copy(sumexp, glse_tail[b_i_tail, s_i, 
split_idx, H0:H1]) + + @T.macro + def sparse_mla_head_no_split( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, batch - batch_tail, kv_group, threads=threads) as ( + bx, + by, + bz, + ): + Q_spilt0_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt1_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt2_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_spilt3_shared = T.alloc_fragment([H_per_block, D_spilt], dtype) + Q_tail_shared = T.alloc_fragment([H_per_block, D_tail], dtype) + + KV_spilt0_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt1_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_spilt2_shared = T.alloc_shared([BI, D_spilt], dtype) + KV_split0_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_split1_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_split2_local = T.alloc_fragment([BI, D_spilt], dtype) + KV_spilt3_local = T.alloc_fragment([BI, D_spilt], dtype) + K_tail_local = T.alloc_fragment([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_o0 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o1 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o2 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + acc_o3 = T.alloc_fragment([H_per_block, D_spilt], accum_dtype) + + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + S_shared = T.alloc_shared([H_per_block, BI], dtype) + sumexp = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i = T.alloc_fragment([H_per_block], accum_dtype) + alpha = T.alloc_fragment([H_per_block], accum_dtype) + m_i = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) + indices_local = T.alloc_local([1], indices_dtype) + indices_mask = T.alloc_fragment([BI], 
indices_dtype) + indices_local_1 = T.alloc_fragment([1], indices_dtype) + valid_NI = T.alloc_fragment([1], "int") + + b_i, g_i = by, bz + s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + # q_i = q_start_index_s[0] + s_i + # max_kv_i = (q_i + 1 - kv_stride) // kv_stride + # kv_i = (q_i + 1 - kv_stride) // kv_stride + # max_kv_i = kv_i if (kv_i <= seq_len_kv - 1) else seq_len_kv - 1 + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * max_block_m) + H1 = H0 + H_per_block + + tx = T.get_thread_binding() + T.copy(Q[b_i, s_i, H0:H1, :D_spilt], Q_spilt0_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, D_spilt:2*D_spilt], Q_spilt1_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, 2*D_spilt:3*D_spilt], Q_spilt2_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, 3*D_spilt:4*D_spilt], Q_spilt3_shared, coalesced_width=kv_vectorized) + T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) + + T.fill(acc_o0, 0) + T.fill(acc_o1, 0) + T.fill(acc_o2, 0) + T.fill(acc_o3, 0) + T.fill(sumexp, 1) + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan + T.fill(valid_NI, 0) + + for i_i in T.serial(NI): + first_indices = Indices[b_i, s_i, g_i, i_i * BI] + # if first_indices <= max_kv_i and first_indices >= 0: + if first_indices >= 0: + valid_NI[0] += 1 + + for i_i in T.Pipelined(valid_NI[0], num_stages=num_stages): + for bi_i in T.Parallel(BI): + indices_mask[bi_i] = Indices[b_i, s_i, g_i, i_i * BI + bi_i] + mask[bi_i] = indices_mask[bi_i] >= 0 + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else( + mask[bi_i], 0, -T.infinity(acc_s.dtype) + ) + + for bi_i, d_i in T.Parallel(BI, D_spilt): + KV_spilt0_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, d_i], 0) + KV_spilt1_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], 
g_i, D_spilt + d_i], 0) + KV_spilt2_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, 2*D_spilt + d_i], 0) + for bi_i, d_i in T.Parallel(BI, D_spilt): + KV_spilt3_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, 3*D_spilt + d_i], 0) + + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_local[bi_i, d_i] = T.if_then_else(Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0, + KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, D + d_i], 0) + + T.copy(KV_spilt0_local, KV_spilt0_shared) + T.copy(KV_spilt1_local, KV_spilt1_shared) + T.copy(KV_spilt2_local, KV_spilt2_shared) + T.gemm(Q_spilt0_shared, KV_spilt0_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt1_shared, KV_spilt1_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt2_shared, KV_spilt2_shared, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_spilt3_shared, KV_spilt3_local, acc_s, transpose_B=True, k_pack=kpack, policy=gemm1_policy) + T.gemm(Q_tail_shared, K_tail_local, acc_s, transpose_B=True, policy=gemm1_policy) + + T.copy(m_i, m_i_prev) + if gemm1_policy == T.GemmWarpPolicy.FullColK: + T.reduce_sum_warp(acc_s, acc_s, clear=False) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp2( + acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale + ) + T.reduce_sum(acc_s, sumexp_i, dim=1) + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i] + for h_i, d_i in T.Parallel(H_per_block, D_spilt): + acc_o0[h_i, d_i] = acc_o0[h_i, d_i] * alpha[h_i] + acc_o1[h_i, d_i] = acc_o1[h_i, d_i] * alpha[h_i] + acc_o2[h_i, d_i] = acc_o2[h_i, d_i] * alpha[h_i] + acc_o3[h_i, d_i] 
= acc_o3[h_i, d_i] * alpha[h_i] + + T.copy(acc_s, S_shared) + T.gemm(S_shared, KV_spilt0_shared, acc_o0, k_pack=kpack, policy=gemm2_policy) + T.copy(KV_spilt3_local, KV_spilt0_shared) + T.gemm(S_shared, KV_spilt1_shared, acc_o1, k_pack=kpack, policy=gemm2_policy) + T.gemm(S_shared, KV_spilt2_shared, acc_o2, k_pack=kpack, policy=gemm2_policy) + T.gemm(S_shared, KV_spilt0_shared, acc_o3, k_pack=kpack, policy=gemm2_policy) + # Rescale + for h_i, d_i in T.Parallel(H_per_block, D_spilt): + acc_o0[h_i, d_i] /= sumexp[h_i] + acc_o1[h_i, d_i] /= sumexp[h_i] + acc_o2[h_i, d_i] /= sumexp[h_i] + acc_o3[h_i, d_i] /= sumexp[h_i] + + if out_shared_reuse: + acc_oshared0 = T.alloc_shared([H_per_block, D_spilt], dtype) + acc_oshared1 = T.alloc_shared([H_per_block, D_spilt], dtype) + T.annotate_layout({ + acc_oshared0: tilelang.layout.make_hcu_swizzled_layout(acc_oshared0, major_pack=2), + acc_oshared1: tilelang.layout.make_hcu_swizzled_layout(acc_oshared1, major_pack=2), + }) + T.copy(acc_o0, acc_oshared0) + T.copy(acc_o1, acc_oshared1) + T.copy(acc_oshared0, Output[b_i, s_i, H0:H1, 2*D_spilt:3*D_spilt]) + T.copy(acc_oshared1, Output[b_i, s_i, H0:H1, 3*D_spilt:4*D_spilt]) + T.copy(acc_o2, acc_oshared0) + T.copy(acc_o3, acc_oshared1) + T.copy(acc_oshared0, Output[b_i, s_i, H0:H1, :D_spilt]) + T.copy(acc_oshared1, Output[b_i, s_i, H0:H1, D_spilt:2*D_spilt]) + else: + acc_oshared0 = T.alloc_shared([H_per_block, D_spilt], dtype) + acc_oshared1 = T.alloc_shared([H_per_block, D_spilt], dtype) + acc_oshared2 = T.alloc_shared([H_per_block, D_spilt], dtype) + acc_oshared3 = T.alloc_shared([H_per_block, D_spilt], dtype) + T.annotate_layout({ + acc_oshared0: tilelang.layout.make_hcu_swizzled_layout(acc_oshared0, major_pack=2), + acc_oshared1: tilelang.layout.make_hcu_swizzled_layout(acc_oshared1, major_pack=2), + acc_oshared2: tilelang.layout.make_hcu_swizzled_layout(acc_oshared2, major_pack=2), + acc_oshared3: tilelang.layout.make_hcu_swizzled_layout(acc_oshared3, major_pack=2), + }) + 
# Merge the per-split partial outputs of the "tail" batches into Output.
#
# Launches one program per (sequence position bx, head by, tail batch bz).
# The values in glse_tail are combined in base 2 (exp2/log2), so they are
# treated as base-2 log-sum-exp terms of each split.
# NOTE(review): shapes of glse_tail / Output_partial_tail are only visible
# here through their indexing, [bz, bx, k, by] and [bz, bx, k, by, i].
@T.macro
def combine_tail(
    glse_tail: T.Tensor(glse_shape_tail, intermediate_dtype),
    Output_partial_tail: T.Tensor(output_partial_shape_tail, intermediate_dtype),
    Output: T.Tensor(o_shape, dtype),
):
    with T.Kernel(seq_len, num_heads, batch_tail, threads=128) as (bx, by, bz):
        po_local = T.alloc_fragment([dim], intermediate_dtype)
        o_accum_local = T.alloc_fragment([dim], accum_dtype)
        lse_local_split = T.alloc_local([num_split_tail], accum_dtype)
        lse_logsum_local = T.alloc_local([1], accum_dtype)
        lse_max_local = T.alloc_local([1], accum_dtype)
        scale_local = T.alloc_local([1], accum_dtype)

        T.clear(lse_logsum_local)
        T.clear(o_accum_local)

        # Tail batches are written after the head batches in Output, so the
        # output batch index is shifted by the number of head batches.
        b_head = batch - batch_tail
        out_bz = bz + b_head
        # Pass 1: load every split's LSE and track the maximum, which is
        # subtracted below before exponentiating.
        lse_max_local[0] = -T.infinity(accum_dtype)
        for k in T.serial(num_split_tail):
            lse_local_split[k] = glse_tail[bz, bx, k, by]
            lse_max_local[0] = T.max(lse_max_local[0], lse_local_split[k])
        # Pass 2: global LSE = log2(sum_k 2^(lse_k - max)) + max.
        for k in T.Pipelined(num_split_tail, num_stages=0):
            lse_logsum_local[0] += T.exp2(lse_local_split[k] - lse_max_local[0])
        lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0]
        # Pass 3: accumulate each partial output weighted by 2^(lse_k - global LSE).
        for k in T.serial(num_split_tail):
            for i in T.Parallel(dim):
                po_local[i] = Output_partial_tail[bz, bx, k, by, i]
            scale_local[0] = T.exp2(lse_local_split[k] - lse_logsum_local[0])
            for i in T.Parallel(dim):
                o_accum_local[i] += po_local[i] * scale_local[0]
        for i in T.Parallel(dim):
            Output[out_bz, bx, by, i] = o_accum_local[i]
+ glse_tail: T.Tensor(glse_shape_tail, intermediate_dtype), + Output_partial_tail: T.Tensor(output_partial_shape_tail, intermediate_dtype), + Output: T.Tensor(o_shape, dtype), + ): + with T.Kernel(seq_len, num_heads, batch, threads=128) as (bx, by, bz): + po_local = T.alloc_fragment([dim], intermediate_dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_logsum_local = T.alloc_local([1], accum_dtype) + lse_max_local = T.alloc_local([1], accum_dtype) + scale_local = T.alloc_local([1], accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + + if bz < batch_head: + lse_local_split = T.alloc_local([num_split], accum_dtype) + lse_max_local[0] = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_local_split[k] = glse[bz, bx, k, by] + lse_max_local[0] = T.max(lse_max_local[0], lse_local_split[k]) + for k in T.Pipelined(num_split, num_stages=0): + lse_logsum_local[0] += T.exp2(lse_local_split[k] - lse_max_local[0]) + lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, bx, k, by, i] + scale_local[0] = T.exp2(lse_local_split[k] - lse_logsum_local[0]) + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local[0] + for i in T.Parallel(dim): + Output[bz, bx, by, i] = o_accum_local[i] + else: + bz_tail = bz - batch_head + lse_max_local[0] = -T.infinity(accum_dtype) + lse_local_split = T.alloc_local([num_split_tail], accum_dtype) + for k in T.serial(num_split_tail): + lse_local_split[k] = glse_tail[bz_tail, bx, k, by] + lse_max_local[0] = T.max(lse_max_local[0], lse_local_split[k]) + for k in T.Pipelined(num_split_tail, num_stages=0): + lse_logsum_local[0] += T.exp2(lse_local_split[k] - lse_max_local[0]) + lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + for k in T.serial(num_split_tail): + for i in T.Parallel(dim): + po_local[i] = Output_partial_tail[bz_tail, bx, k, by, i] + scale_local[0] 
# Entry point used when both the head batches and the tail batches are split:
# run the split kernel for each part into its own partial buffers, then merge
# both sets of partials into Output with combine_all.
@T.prim_func
def main_all_split(
    Q: T.Tensor(q_shape, dtype),  # type: ignore
    KV: T.Tensor(kv_shape, dtype),  # type: ignore
    Indices: T.Tensor(indices_shape, indices_dtype),  # type: ignore
    glse: T.Tensor(glse_shape, intermediate_dtype),  # type: ignore
    Output_partial: T.Tensor(output_partial_shape, intermediate_dtype),  # type: ignore
    glse_tail: T.Tensor(glse_shape_tail, intermediate_dtype),  # type: ignore
    Output_partial_tail: T.Tensor(output_partial_shape_tail, intermediate_dtype),  # type: ignore
    Output: T.Tensor(o_shape, dtype),  # type: ignore
):
    sparse_mla_split(Q, KV, Indices, glse, Output_partial)
    sparse_mla_tail_split(Q, KV, Indices, glse_tail, Output_partial_tail)
    combine_all(glse, Output_partial, glse_tail, Output_partial_tail, Output)


# Entry point used when only the tail batches are split: the head part is
# computed by sparse_mla_head_no_split, which receives Output directly, and
# only the tail partials go through combine_tail.
# glse and Output_partial are accepted but not referenced in this body.
# NOTE(review): presumably kept so both entry points share one signature -
# confirm against the caller that allocates these buffers.
@T.prim_func
def main_tail_split(
    Q: T.Tensor(q_shape, dtype),  # type: ignore
    KV: T.Tensor(kv_shape, dtype),  # type: ignore
    Indices: T.Tensor(indices_shape, indices_dtype),  # type: ignore
    glse: T.Tensor(glse_shape, intermediate_dtype),  # type: ignore
    Output_partial: T.Tensor(output_partial_shape, intermediate_dtype),  # type: ignore
    glse_tail: T.Tensor(glse_shape_tail, intermediate_dtype),  # type: ignore
    Output_partial_tail: T.Tensor(output_partial_shape_tail, intermediate_dtype),  # type: ignore
    Output: T.Tensor(o_shape, dtype),  # type: ignore
):
    sparse_mla_head_no_split(Q, KV, Indices, Output)
    sparse_mla_tail_split(Q, KV, Indices, glse_tail, Output_partial_tail)
    combine_tail(glse_tail, Output_partial_tail, Output)
def get_benifit(ceil, occupancy):
    """Return the divisor ``get_score`` applies to the fully occupied waves.

    Args:
        ceil: Wave count, ``ceil(key * split / cu_count)`` as computed by
            ``get_score``.
        occupancy: Occupancy figure supplied by the caller.

    Returns:
        ``1.8`` when ``occupancy > 1`` and ``ceil >= occupancy``, else ``1.0``.
    """
    if occupancy <= 1:
        return 1.0
    benifit = 1.0
    if ceil >= occupancy:
        benifit = 1.8
    return benifit


@functools.lru_cache(maxsize=8192)
def get_score(key, q_heads, cu_count, max_split, split, occupancy, combine_base):
    """Return the heuristic cost of running ``key`` work items with ``split`` splits.

    Lower is better; ``get_best_split`` keeps the candidate with the smallest
    score. The score is the sum of two terms:

    * combine term: ``ceil(key * q_heads / cu_count) * combine_base * (split >> 1)``,
      which is zero for ``split == 1``;
    * attention term: waves that fill ``occupancy`` are discounted by
      ``get_benifit``, the remaining waves are charged in full, and the whole
      term grows by 5% for every two waves.

    Args:
        key: Number of work items to schedule.
        q_heads: Number of query heads.
        cu_count: Number of compute units.
        max_split: Largest split considered; the per-wave cost is
            ``max_split // split``.
        split: Candidate split count (must be non-zero).
        occupancy: Occupancy figure forwarded to ``get_benifit``.
        combine_base: Weight of the combine term.
    """
    combine_score = (key * q_heads + cu_count - 1) // cu_count * combine_base * (split >> 1)
    score_base = (max_split // split)
    ceil = (key * split + cu_count - 1) // cu_count
    benifit = get_benifit(ceil, occupancy)

    remain = ceil % occupancy
    floor = ceil - remain
    mla_score = (floor * score_base / benifit + remain * score_base) * (1.05 ** (ceil >> 1))
    score = mla_score + combine_score
    return score


@functools.lru_cache(maxsize=4096)
def get_best_split(key, cu_count, q_heads, max_split, combine_base, occupancy, split_base=1):
    """Pick the split count with the lowest ``get_score`` for ``key`` work items.

    The search starts from ``split_base`` and only considers candidates that
    are multiples of it. The candidate list shrinks as ``key`` grows relative
    to ``cu_count``.

    Returns:
        ``(num_split, min_score)`` for the best candidate.
    """
    min_score = get_score(key, q_heads, cu_count, max_split, split_base, occupancy, combine_base)
    num_split = split_base

    # Select the candidate list based on how many work items there are.
    if key <= 4:
        splits = [16, 32]
    elif key <= cu_count // 2:
        splits = [1, 2, 4, 8, 16]
    elif key <= cu_count:
        splits = [1, 2, 4, 8]
    elif key <= cu_count * 2:
        splits = [1, 2, 4]
    elif key <= cu_count * 4:
        splits = [1, 2]
    else:
        splits = [1]

    for split in splits:
        # Candidates that are not multiples of split_base are skipped.
        if split % split_base != 0:
            continue
        score = get_score(key, q_heads, cu_count, max_split, split, occupancy, combine_base)
        if score < min_score:
            min_score = score
            num_split = split
    return num_split, min_score


@functools.lru_cache(maxsize=4096)
def get_streamk_config(key, count, cu_count, q_heads, max_split, combine_base, occupancy):
    """Score a head/tail partition of ``key`` work items for granularity ``count``.

    ``key`` is divided into a head (the largest multiple of ``count``) and a
    tail (the remainder). Each part gets its own split count. When both parts
    end up with the same split, or there is no head, the result collapses to a
    single uniform split over all of ``key``. A tail that would run unsplit is
    not supported, so that case also falls back to one split over ``key``.

    Returns:
        ``(total_score, head, num_split, num_split_tail)``. ``head`` is the
        number of work items handled by the head part and
        ``num_split_tail == 0`` means there is no separate tail.
    """
    tail = key % count
    head = key // count * count
    head_score = 0
    tail_score = 0
    num_split = 0
    num_split_tail = 0
    if head > 0:
        # Smallest split that spreads head // count groups evenly over the CUs.
        num_split = cu_count // count
        gcd = math.gcd(head // count, num_split)
        num_split = num_split // gcd
        if occupancy > 1:
            num_split, head_score = get_best_split(
                head, cu_count, q_heads, max_split, combine_base, occupancy, split_base=num_split
            )
        else:
            # Without this the head kept a score of 0, so any partition with
            # a head looked free and always beat the alternatives.
            head_score = get_score(head, q_heads, cu_count, max_split, num_split, occupancy, combine_base)

    if tail > 0:
        num_split_tail, tail_score = get_best_split(tail, cu_count, q_heads, max_split, combine_base, occupancy)

    merged = False
    if num_split == num_split_tail or num_split == 0:
        # split of head and tail are the same, use splitk not streamk
        head = key
        tail = 0
        num_split = num_split_tail
        num_split_tail = 0
        merged = True
    elif num_split_tail == 1:
        # not support tail no split
        head = key
        tail = 0
        num_split, head_score = get_best_split(key, cu_count, q_heads, max_split, combine_base, occupancy)
        num_split_tail = 0
        tail_score = 0

    if merged:
        total_score = get_score(key, q_heads, cu_count, max_split, num_split, occupancy, combine_base)
    else:
        total_score = head_score + tail_score

    return total_score, head, num_split, num_split_tail
def get_config_fast(batch, seq_len, q_heads):
    """Choose a split count with a cheap closed-form rule.

    The number of work items is ``batch * seq_len * ceil(q_heads / block_M)``.
    The result is the smallest power of two ``p`` in 1..16 such that
    ``work_items * p > cu_count``, or 32 when even 16 is not enough.

    Returns:
        ``(block_I, threads, num_stages, num_split, num_split_tail, batch_head)``
        with the fixed values ``(32, 256, 0, num_split, 0, batch)``.
    """
    head_tile = 32 if q_heads > 16 else 16
    head_blocks = (q_heads + head_tile - 1) // head_tile
    work_items = batch * seq_len * head_blocks

    num_split = 32
    for shift in range(5):
        if (work_items << shift) > cu_count:
            num_split = 1 << shift
            break

    return (32, 256, 0, num_split, 0, batch)


def get_best_config(batch, seq_len, q_heads):
    """Dispatch between the scored search and the fast rule.

    Small problems (``seq_len <= 8`` with ``batch <= 256``, or a single batch
    with ``seq_len <= 256``) use ``get_best_streamk_config``; everything else
    uses ``get_config_fast``.
    """
    # for now batch will always be 1
    if not ((seq_len <= 8 and batch <= 256) or (batch == 1 and seq_len <= 256)):
        return get_config_fast(batch, seq_len, q_heads)
    return get_best_streamk_config(batch, seq_len, q_heads)


@functools.lru_cache(maxsize=64)
def _get_sparse_mla_fwd_kernel(heads, dim, tail_dim, topk, kv_group, sm_scale, block_I,
                               threads, num_stages, num_split, num_split_tail, kv_stride, dtype):
    """Build the sparse MLA forward kernel once per distinct argument tuple.

    The result is memoized so repeated calls with the same configuration do
    not re-run the body of ``sparse_mla_fwd``.
    """
    kernel = sparse_mla_fwd(
        heads,
        dim,
        tail_dim,
        topk,
        dtype=dtype,
        kv_group=kv_group,
        sm_scale=sm_scale,
        is_causal=True,
        kv_stride=kv_stride,
        block_I=block_I,
        threads=threads,
        num_stages=num_stages,
        num_split=num_split,
        num_split_tail=num_split_tail,
    )
    return kernel
seq_len, heads) + + # Use cached kernel creation to avoid re-executing sparse_mla_fwd function body + kernel = _get_sparse_mla_fwd_kernel( + heads, dim, tail_dim, topk, kv_group, sm_scale, block_I, + threads, num_stages, num_split, num_split_tail, kv_stride, dtype) + + return kernel, num_split, num_split_tail, batch_head + +def tilelang_sparse_fwd( + q: torch.Tensor, + kv: torch.Tensor, + indices: torch.Tensor, + sm_scale: float, + d_v: int = 512, +) -> torch.Tensor: + """ + TileLang sparse MLA forward pass interface. + + Args: + q: Query tensor of shape (S, H, DQK) + kv: Key-Value tensor of shape (SKV, HKV, DQK) + indices: Indices tensor of shape (S, HKV, topk) + sm_scale: Softmax scale factor + d_v: Value dimension (default: 512) + + Returns: + Output tensor of shape (B=1, S, H, d_v) + """ + assert q.dim() == 3 and kv.dim() == 3 and indices.dim() == 3 + # Infer dtype from input tensor + if q.dtype == torch.bfloat16: + dtype_str = "bfloat16" + elif q.dtype == torch.float16: + dtype_str = "float16" + else: + raise ValueError(f"Unsupported dtype: {q.dtype}, only bfloat16 and float16 are supported") + + # Get output shape + B = 1 + S, H, _ = q.shape + # Call sparse_mla_fwd_interface to get kernel + tilelang_kernel, num_split, num_split_tail, batch_head = sparse_mla_fwd_interface( + q.unsqueeze(0), kv.unsqueeze(0), indices.unsqueeze(0), + kv_stride=1, + sm_scale=sm_scale, + d_v=d_v, + dtype=dtype_str + ) + + intermediate_dtype = torch.float16 + # intermediate_dtype = q.dtype + tl_out = torch.empty((B, S, H, d_v), dtype=q.dtype, device=q.device) + # Allocate intermediate tensors and execute kernel + if num_split_tail > 0: + assert B > batch_head, "B must be greater than batch_head" + glse = torch.empty((batch_head, S, num_split, H), dtype=intermediate_dtype, device=q.device) + output_partial = torch.empty((batch_head, S, num_split, H, d_v), dtype=intermediate_dtype, device=q.device) + glse_tail = torch.empty((B - batch_head, S, num_split_tail, H), 
def ref_sparse_mla_fwd_interface(q, kv, indices, output_dtype, q_start_s_index=0, kv_stride=1, sm_scale=None, is_casual=True):
    """Dense PyTorch reference for the sparse MLA forward pass.

    A batch dimension of size 1 is added to every input and the computation
    runs in float32 on the device of ``q``.

    Args:
        q: Query tensor of shape (S, H, 576).
        kv: Key/value tensor of shape (SKV, G, 576). The first 512 channels
            are used as values and all 576 as keys.
        indices: Selected key positions of shape (S, G, topk). Negative
            entries are ignored.
        output_dtype: dtype of the returned tensor.
        q_start_s_index: Position of the first query row, used by the causal
            mask.
        kv_stride: Stride between the positions represented by consecutive
            key rows.
        sm_scale: Softmax scale; defaults to ``576 ** -0.5`` (the query width).
        is_casual: Accepted for interface compatibility; the causal mask is
            always applied.

    Returns:
        Tensor of shape (1, S, H, 512) in ``output_dtype``.
    """
    q = q.unsqueeze(0).float()
    kv = kv.unsqueeze(0).float()
    indices = indices.unsqueeze(0).transpose(1, 2)  # (1, G, S, topk)
    b, sq, h, dim_q = q.shape
    _, sk, g, _ = kv.shape

    assert kv.shape[-1] == 576, "you should assign dim otherwise"
    dim = 512
    k = kv
    v = kv[..., :dim]
    dim_v = v.shape[-1]

    # Build the position ranges on the inputs' device; a literal "cuda" breaks
    # CPU inputs and tensors living on a non-default GPU.
    device = q.device
    q_pos = torch.arange(
        q_start_s_index, sq + q_start_s_index, dtype=torch.int32, device=device
    ).view(-1, 1)
    kv_pos = torch.arange(
        kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device=device
    ).view(1, -1)
    compressed_casual_mask = q_pos >= kv_pos

    # Route invalid (negative) indices to an extra column that is dropped below.
    indices = torch.where(indices >= 0, indices, sk)
    mask = q.new_zeros(b, g, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1)
    mask = mask[..., :-1]
    mask = mask & compressed_casual_mask.view(1, 1, sq, sk)
    # The first kv_stride - 1 query rows are always allowed to see key 0.
    mask[:, :, :kv_stride - 1, 0] = True
    mask = mask.view(b, g, 1, sq, sk)

    q = q.reshape(b, sq, g, -1, dim_q)
    score = torch.einsum("bmghd,bngd->bghmn", q, k)
    sm_scale = dim_q**-0.5 if sm_scale is None else sm_scale
    score = score.masked_fill(~mask, float("-inf")).mul(sm_scale)
    p = score.softmax(dim=-1)  # (b, g, h // g, sq, sk)
    o = torch.einsum("bghmn,bngd->bmghd", p.type(v.dtype), v)
    o = o.reshape(b, sq, h, dim_v)
    return o.to(output_dtype)
def biased_grouped_topk(
    gating_output: torch.Tensor,
    correction_bias: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_expert_group: int,
    topk_group: int,
    num_fused_shared_experts: int,
    need_renorm: bool,
    routed_scaling_factor: float = 1.0,  # mul to topk_weights
):
    """Run biased grouped top-k routing, choosing a backend by problem size.

    ``biased_grouped_topk_hip`` is used when there are no fused shared experts
    and the token count is at most ``get_cu_num() * 212``. Otherwise
    ``moe_fused_gate`` is used, which requires ``need_renorm`` to be true.

    Returns:
        Whatever the selected backend returns.
    """
    num_tokens = gating_output.shape[0]
    token_limit = get_cu_num() * 212

    if num_tokens > token_limit or num_fused_shared_experts != 0:
        topk = topk_ids.shape[1]
        assert need_renorm, "Renormalization is required for moe_fused_gate."
        return moe_fused_gate(
            gating_output,
            correction_bias,
            topk_weights,
            topk_ids,
            num_expert_group,
            topk_group,
            topk,
            num_fused_shared_experts=num_fused_shared_experts,
            routed_scaling_factor=routed_scaling_factor,
        )

    # NOTE(review): this path passes a literal 1.0 and ignores the caller's
    # routed_scaling_factor, while the fused-gate path forwards it. Confirm
    # whether scaling is intentionally applied elsewhere for this path.
    return biased_grouped_topk_hip(
        gating_output,
        correction_bias,
        topk_weights,
        topk_ids,
        num_expert_group,
        topk_group,
        need_renorm,
        1.0,
    )
# Python-side declaration of the "module_topk_plain" op. The body is a
# placeholder; the decorator supplies the implementation.
# NOTE(review): the roles of x / topk_ids / topk_out / stride0 / stride1 are
# not documented in this file - confirm against the kernel source.
@compile_ops("module_topk_plain")
def topk_plain(
    x: torch.Tensor,
    topk_ids: torch.Tensor,
    topk_out: torch.Tensor,
    topk: int,
    largest: bool = True,
    rowStarts: torch.Tensor = None,  # start index of each batch in the variable-length sequence, shape [batch_size]
    rowEnds: torch.Tensor = None,  # end index of each batch in the variable-length sequence, shape [batch_size]; actual length of each batch is rowEnds[batch_id] - rowStarts[batch_id]
    stride0: int = -1,
    stride1: int = 1,
) -> None:
    pass
def fast_topk_transform_fused(
    score: torch.Tensor,
    lengths: torch.Tensor,
    page_table_size_1: torch.Tensor,  # NOTE: page size should be 1
    cu_seqlens_q: torch.Tensor,
    topk: int,
    row_starts: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Select the top-k entries of each score row and map them through a page
    table whose page size is 1.

    Args:
        score: Logits between query and key, shape (B, L). The key layout is
            either ragged or paged.
        lengths: Valid length of each row, shape (B).
        page_table_size_1: Source page table, shape (Batch, topk).
        cu_seqlens_q: Cumulative query sequence lengths, shape (Batch + 1).
        topk: Number of entries to select; only 2048 is accepted.
        row_starts: Start offset of each row inside ``score``, shape (B).
            Top-k for row i is restricted to
            [row_starts[i], row_starts[i] + lengths[i]]. Only needed when the
            key is ragged, i.e. during extend and draft extend.
    Returns:
        A newly allocated int32 tensor of shape (B, topk) filled by the kernel.
    """
    assert (
        topk == 2048
    ), "fast_topk_transform_fused is only optimized for deepseek v3.2 model, where topk=2048"
    assert score.dim() == 2

    num_rows = score.shape[0]
    result = score.new_empty((num_rows, topk), dtype=torch.int32)
    fast_topk_transform_interface(
        score,
        lengths,
        result,
        page_table_size_1,
        cu_seqlens_q,
        row_starts,
    )
    return result
@triton.jit
def binary_search(value, arr_ptr, arr_length):
    """Return the index of the last element of ``arr`` that is ``<= value``.

    Searches ``arr_length`` elements starting at ``arr_ptr``. Returns ``-1``
    when every element is greater than ``value``.
    NOTE(review): assumes the array is sorted in ascending order.
    """
    left = 0
    right = arr_length - 1

    while left <= right:
        mid = (left + right) // 2
        mid_value = tl.load(arr_ptr + mid)

        # Keep moving right while elements are <= value, so ``left`` ends one
        # past the last such element.
        if mid_value <= value:
            left = mid + 1
        else:
            right = mid - 1

    return left - 1
def ragged_layout_trans(kv_indptr, kv_indices, k_buffer, v_buffer):
    """Gather K/V rows selected by a ragged index into dense tensors.

    Args:
        kv_indptr: Offsets of shape (B + 1); ``kv_indptr[-1]`` is the total
            number of tokens.
        kv_indices: Row index into ``k_buffer`` / ``v_buffer`` for each token.
        k_buffer: Key buffer whose dims 1 and 2 are (H_KV, D).
        v_buffer: Value buffer, read with the same offsets as ``k_buffer``.

    Returns:
        ``(k_values, v_values)``, each of shape (total_tokens, H_KV, D) with
        the dtype and device of ``k_buffer``.
    """
    B = kv_indptr.shape[0] - 1
    H_KV = k_buffer.shape[1]
    D = k_buffer.shape[2]
    dtype = k_buffer.dtype
    # Allocate next to the inputs instead of on the default "cuda" device.
    device = k_buffer.device

    # One host sync; reuse the Python int for the allocations below rather
    # than passing the 0-dim tensor as a size.
    total_tokens = int(kv_indptr[-1].item())
    k_values = torch.empty((total_tokens, H_KV, D), dtype=dtype, device=device)
    v_values = torch.empty((total_tokens, H_KV, D), dtype=dtype, device=device)
    if total_tokens == 0:
        # Nothing to copy, so no kernel launch is needed.
        return k_values, v_values

    BLOCK_TOKEN = 16
    BLOCK_E_DIM = triton.next_power_of_2(H_KV * D)

    token_blocks = triton.cdiv(total_tokens, BLOCK_TOKEN)

    grid = (token_blocks,)

    _ragged_trans_kernel[grid](
        k_buffer,
        v_buffer,
        k_values,
        v_values,
        kv_indptr,
        kv_indices,
        B=B,
        E_DIM=H_KV * D,
        total_tokens=total_tokens,
        BLOCK_TOKEN=BLOCK_TOKEN,
        BLOCK_E_DIM=BLOCK_E_DIM,
    )

    return k_values, v_values
import quant + + +__all__ = [ + # "quant", +] diff --git a/aiter/ops/triton/_triton_kernels/__init__.py b/aiter/ops/triton/_triton_kernels/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..548d2d447de68422c7e6df5d1da98b2169133f1d --- /dev/null +++ b/aiter/ops/triton/_triton_kernels/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: MIT diff --git a/aiter/ops/triton/_triton_kernels/attention/__init__.py b/aiter/ops/triton/_triton_kernels/attention/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..548d2d447de68422c7e6df5d1da98b2169133f1d --- /dev/null +++ b/aiter/ops/triton/_triton_kernels/attention/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: MIT diff --git a/aiter/ops/triton/_triton_kernels/attention/fp8_mqa_logits.py b/aiter/ops/triton/_triton_kernels/attention/fp8_mqa_logits.py new file mode 100644 index 0000000000000000000000000000000000000000..3db61b02627588f958cfcd562f6bd4f8034341b5 --- /dev/null +++ b/aiter/ops/triton/_triton_kernels/attention/fp8_mqa_logits.py @@ -0,0 +1,318 @@ +import triton +import triton.language as tl + + +def _get_autotune_configs(): + configs = [] +# for BLOCK_Q in [1, 2, 4]: + for BLOCK_KV in [64, 128, 256]: + for num_stages in [1, 2]: + for num_warps in [4, 8]: + for waves_per_eu in [0, 2]: + configs.append( + triton.Config( + { + #"BLOCK_Q": BLOCK_Q, + "BLOCK_KV": BLOCK_KV, + "waves_per_eu": waves_per_eu, + }, + num_stages=num_stages, + num_warps=num_warps, + ) + ) + return configs + + +def _get_clear_autotune_configs(): + configs = [] + for BLOCK_KV in [64, 128, 256, 512, 1024, 2048]: + for num_warps in [2, 4, 8]: + for num_stages in [1, 2]: + for waves_per_eu in [0, 2]: + configs.append( + triton.Config( + {"BLOCK_KV": BLOCK_KV, "waves_per_eu": waves_per_eu}, + num_warps=num_warps, + num_stages=num_stages, + ) + ) + return configs +#@triton.autotune( +# configs=_get_autotune_configs(), +# #configs=[ +# # triton.Config( +# # {"BLOCK_Q": 1, "BLOCK_KV": 64, 
"waves_per_eu": 2}, num_warps=4, num_stages=1) +# #], +# key=["NUM_HEADS", "HEAD_SIZE", "seq_len_kv"], +#) +@triton.jit +def _fp8_mqa_logits_kernel( + Q_ptr, # fp8e4m3 [seq_len, H, D] + KV_ptr, # fp8e4m3 [seq_len_kv, D] + kv_scales_ptr, # fp32 [seq_len_kv] + weights_ptr, # fp32 [seq_len, H] + cu_start_ptr, # int32 [seq_len] + cu_end_ptr, # int32 [seq_len] + logits_ptr, # fp32 [seq_len, seq_len_kv] + seq_len, + seq_len_kv, + NUM_HEADS: tl.constexpr, + HEAD_SIZE: tl.constexpr, + # strides + stride_q_s: tl.int64, + stride_q_h: tl.constexpr, + stride_q_d: tl.constexpr, + stride_kv_s: tl.int64, + stride_kv_d: tl.constexpr, + stride_w_s: tl.int64, + stride_w_h: tl.constexpr, + stride_logits_s: tl.int64, + stride_logits_k: tl.int64, + # block sizes + BLOCK_KV: tl.constexpr, + LOGITS_MASKING: tl.constexpr, +): + row_id = tl.program_id(0) + # go from larger to smaller in terms of work + # to reduce the tail effect + row_id = tl.num_programs(0) - row_id - 1 + tl.assume(row_id >= 0) + tl.assume(stride_q_s > 0) + tl.assume(stride_q_h > 0) + tl.assume(stride_q_d > 0) + tl.assume(stride_kv_s > 0) + tl.assume(stride_kv_d > 0) + tl.assume(stride_w_s > 0) + tl.assume(stride_w_h > 0) + + logits_row_ptrs = logits_ptr + row_id * stride_logits_s + + h_inds = tl.arange(0, NUM_HEADS)[:, None] + d_inds = tl.arange(0, HEAD_SIZE) + + # load Q[BLOCK_Q, NUM_HEADS, HEAD_SIZE] + q_ptrs = ( + Q_ptr + row_id * stride_q_s + h_inds * stride_q_h + d_inds[None, :] * stride_q_d + ) + + q_block = tl.load(q_ptrs, cache_modifier=".cg") + w_ptrs = weights_ptr + row_id * stride_w_s + h_inds * stride_w_h + w_block = tl.load(w_ptrs, cache_modifier=".cg").to(tl.float32) + + # Load start/end for each row in this block + start_ind = tl.load(cu_start_ptr + row_id) + end_ind = tl.load(cu_end_ptr + row_id) + + if LOGITS_MASKING: + start_ind = tl.maximum(start_ind, 0) + end_ind = tl.minimum(end_ind, seq_len_kv) + else: + start_ind = tl.maximum(start_ind, 0) // BLOCK_KV * BLOCK_KV + end_ind = 
tl.cdiv(tl.minimum(end_ind, seq_len_kv), BLOCK_KV) * BLOCK_KV + shifted_end = end_ind - start_ind + shifted_unmasked_end = shifted_end // BLOCK_KV * BLOCK_KV + + kv_col_offsets = tl.arange(0, BLOCK_KV) + start_ind + kv_ptrs = ( + KV_ptr + kv_col_offsets[None, :] * stride_kv_s + d_inds[:, None] * stride_kv_d + ) + + kv_scales_ptrs = kv_scales_ptr + kv_col_offsets + + logits_ptrs = logits_row_ptrs + kv_col_offsets * stride_logits_k + + # Loop over KV tiles + for _ in tl.range(0, shifted_unmasked_end, BLOCK_KV): + kv_block = tl.load(kv_ptrs) + kv_scales = tl.load(kv_scales_ptrs) + + # [NUM_HEADS, BLOCK_KV] = [NUM_HEADS, HEAD_SIZE] x [HEAD_SIZE, BLOCK_KV] + scores = tl.dot(q_block, kv_block, input_precision="ieee") + # Multiply by kv_scales (broadcast along rows) + scores = scores * kv_scales[None, :] + # ReLU + scores = tl.maximum(scores, 0.0) + scores = scores * w_block + # [NUM_HEADS, BLOCK_KV] -> [BLOCK_KV, ] + scores = tl.sum(scores, axis=0) + tl.store(logits_ptrs, scores) + + kv_ptrs += BLOCK_KV * stride_kv_s + kv_scales_ptrs += BLOCK_KV + logits_ptrs += BLOCK_KV * stride_logits_k + kv_col_offsets += BLOCK_KV + + # masked load + if LOGITS_MASKING: + kv_col_mask = kv_col_offsets < end_ind + kv_block = tl.load(kv_ptrs, mask=kv_col_mask[None, :], other=0.0) + kv_scales = tl.load(kv_scales_ptrs, mask=kv_col_mask, other=0.0) + + # [NUM_HEADS, BLOCK_KV] = [NUM_HEADS, HEAD_SIZE] x [HEAD_SIZE, BLOCK_KV] + scores = tl.dot(q_block, kv_block, input_precision="ieee") + # Multiply by kv_scales (broadcast along rows) + scores = scores * kv_scales[None, :] + # ReLU + scores = tl.maximum(scores, 0.0) + scores = scores * w_block + # [NUM_HEADS, BLOCK_KV] -> [BLOCK_KV, ] + scores = tl.sum(scores, axis=0) + # masked store + in_window = (kv_col_offsets >= start_ind) & (kv_col_offsets < end_ind) + tl.store(logits_ptrs, scores, mask=in_window) + + +#@triton.autotune( +# configs=_get_autotune_configs(), +# #configs=[ +# # triton.Config( +# # {"BLOCK_Q": 1, "BLOCK_KV": 64, 
"waves_per_eu": 2}, num_warps=4, num_stages=1) +# #], +# key=["NUM_HEADS", "HEAD_SIZE", "seq_len_kv"], +#) +@triton.jit +def _fp8_mqa_logits_kernel_grouped( + Q_ptr, # fp8e4m3 [seq_len, H, D] + KV_ptr, # fp8e4m3 [seq_len_kv, D] + kv_scales_ptr, # fp32 [seq_len_kv] + weights_ptr, # fp32 [seq_len, H] + cu_start_ptr, # int32 [seq_len] + cu_end_ptr, # int32 [seq_len] + logits_ptr, # fp32 [seq_len, seq_len_kv] + seq_len, + seq_len_kv, + NUM_HEADS: tl.constexpr, + HEAD_SIZE: tl.constexpr, + # strides + stride_q_s: tl.int64, + stride_q_h: tl.constexpr, + stride_q_d: tl.constexpr, + stride_kv_s: tl.int64, + stride_kv_d: tl.constexpr, + stride_w_s: tl.int64, + stride_w_h: tl.constexpr, + stride_logits_s: tl.int64, + stride_logits_k: tl.int64, + # block sizes + BLOCK_Q: tl.constexpr, + BLOCK_KV: tl.constexpr, +): + block_id = tl.program_id(0) + # go from larger to smaller in terms of work + # to reduce the tail effect + block_id = tl.num_programs(0) - block_id - 1 + tl.assume(block_id >= 0) + tl.assume(stride_q_s > 0) + tl.assume(stride_q_h > 0) + tl.assume(stride_q_d > 0) + tl.assume(stride_kv_s > 0) + tl.assume(stride_kv_d > 0) + tl.assume(stride_w_s > 0) + tl.assume(stride_w_h > 0) + + seq_start = block_id * BLOCK_Q + + # Scalar-load cu_start/cu_end for each row in the block and reduce to the + # union range [cu_k_s_min, cu_k_e_max). BLOCK_Q is a compile-time constant + # (typically 1/2/4), so tl.static_range unrolls this completely. + cu_k_s_min = seq_len_kv + cu_k_e_max = 0 + for bq_i in tl.static_range(BLOCK_Q): + s_i = tl.load(cu_start_ptr + seq_start + bq_i) + e_i = tl.load(cu_end_ptr + seq_start + bq_i) + s_i = tl.maximum(s_i, 0) + e_i = tl.minimum(e_i, seq_len_kv) + cu_k_s_min = tl.minimum(cu_k_s_min, s_i) + cu_k_e_max = tl.maximum(cu_k_e_max, e_i) + cu_k_s_min = tl.maximum(cu_k_s_min, 0) + cu_k_e_max = tl.minimum(cu_k_e_max, seq_len_kv) + + # Round up to full BLOCK_KV blocks — the clear kernel handles out-of-window positions. 
+ total_blocks = tl.cdiv(tl.maximum(cu_k_e_max - cu_k_s_min, 0), BLOCK_KV) + + h_inds = tl.arange(0, NUM_HEADS)[:, None] + d_inds = tl.arange(0, HEAD_SIZE) + + kv_col_offsets = tl.arange(0, BLOCK_KV) + cu_k_s_min + kv_ptrs = ( + KV_ptr + kv_col_offsets[None, :] * stride_kv_s + d_inds[:, None] * stride_kv_d + ) + kv_scales_ptrs = kv_scales_ptr + kv_col_offsets + + # Loop over all KV tiles (including partial tail) — no masking needed, + # _fp8_mqa_clear_logits_kernel will set invalid positions to -inf. + for _ in tl.range(0, total_blocks): + kv_block = tl.load(kv_ptrs) + kv_scales = tl.load(kv_scales_ptrs) + + for bq_i in tl.static_range(BLOCK_Q): + q_ptrs = ( + Q_ptr + (seq_start + bq_i) * stride_q_s + + h_inds * stride_q_h + d_inds[None, :] * stride_q_d + ) + q_block = tl.load(q_ptrs, cache_modifier=".cg") + + w_ptrs = ( + weights_ptr + (seq_start + bq_i) * stride_w_s + + h_inds * stride_w_h + ) + w_block = tl.load(w_ptrs, cache_modifier=".cg").to(tl.float32) + + # [NUM_HEADS, BLOCK_KV] = [NUM_HEADS, HEAD_SIZE] x [HEAD_SIZE, BLOCK_KV] + scores = tl.dot(q_block, kv_block, input_precision="ieee") + scores = scores * kv_scales[None, :] + # ReLU + scores = tl.maximum(scores, 0.0) + scores = scores * w_block + # [NUM_HEADS, BLOCK_KV] -> [BLOCK_KV] + scores = tl.sum(scores, axis=0) + + logits_ptrs = ( + logits_ptr + (seq_start + bq_i) * stride_logits_s + + kv_col_offsets * stride_logits_k + ) + tl.store(logits_ptrs, scores) + + kv_ptrs += BLOCK_KV * stride_kv_s + kv_scales_ptrs += BLOCK_KV + kv_col_offsets += BLOCK_KV + + +#@triton.autotune( +# configs=_get_clear_autotune_configs(), +# key=["seq_len_kv"], +#) +@triton.jit +def _fp8_mqa_clear_logits_kernel( + logits_ptr, # fp32 [seq_len, seq_len_kv] + cu_start_ptr, # int32 [seq_len] + cu_end_ptr, # int32 [seq_len] + seq_len_kv, + stride_logits_s: tl.int64, + stride_logits_k: tl.int64, + BLOCK_KV: tl.constexpr, +): + """Standalone kernel to set logits outside each row's valid + [cu_start, cu_end) window to -inf. 
Grid: (seq_len,).""" + row_id = tl.program_id(0) + + cu_start = tl.load(cu_start_ptr + row_id) + cu_end = tl.load(cu_end_ptr + row_id) + + kv_offsets = tl.arange(0, BLOCK_KV) + + for _ in tl.range(0, tl.cdiv(seq_len_kv, BLOCK_KV)): + invalid = (kv_offsets < cu_start) | (kv_offsets >= cu_end) + in_bounds = kv_offsets < seq_len_kv + store_mask = invalid & in_bounds + + logits_ptrs = ( + logits_ptr + row_id * stride_logits_s + kv_offsets * stride_logits_k + ) + tl.store( + logits_ptrs, + tl.full([BLOCK_KV], float("-inf"), dtype=tl.float32), + mask=store_mask, + ) + + kv_offsets += BLOCK_KV diff --git a/aiter/ops/triton/_triton_kernels/attention/pa_mqa_logits.py b/aiter/ops/triton/_triton_kernels/attention/pa_mqa_logits.py new file mode 100644 index 0000000000000000000000000000000000000000..e22309f239d6a3ae637004686a6a108efd005906 --- /dev/null +++ b/aiter/ops/triton/_triton_kernels/attention/pa_mqa_logits.py @@ -0,0 +1,533 @@ +import triton +import triton.language as tl + + +@triton.jit +def _sum_combine(a, b): + return a + b + + +@triton.jit +def _deepgemm_fp8_paged_mqa_logits_stage1_ragged_k( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch, + stride_q_next_n, + stride_q_heads, + KV_buffer, + stride_k_seq, + scale_buffer, + stride_scale_seq, + prefix_sum_context_lens, + kv_indices, + weights, + stride_w_batch, + Out_buffer, + stride_out_heads, + stride_out_batch, + max_model_len, + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + SplitKV: tl.constexpr = 1, +): + pid = tl.program_id(0) + num_block_q_head = tl.cdiv(heads_num, ChunkQ) + + pid_q_head, remain_pid = pid % num_block_q_head, pid // num_block_q_head + pid_next_n, remain_pid = remain_pid % next_n, remain_pid // next_n + pid_batch, pid_split_kv = remain_pid % batch_size, remain_pid // batch_size + + context_start = tl.load(prefix_sum_context_lens + pid_batch) + context_end = tl.load(prefix_sum_context_lens + pid_batch + 1) + + context_length = context_end - 
context_start + context_chunk_num = tl.cdiv(context_length, ChunkK) + split_context_chunk_num = tl.cdiv(context_chunk_num, SplitKV) + + split_context_start = (pid_split_kv * split_context_chunk_num) * ChunkK + split_context_length = min( + context_length - split_context_start, split_context_chunk_num * ChunkK + ) + + q = tl.load( + Q_buffer + + pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ((pid_q_head * ChunkQ + tl.arange(0, ChunkQ)) * stride_q_heads)[:, None] + + tl.arange(0, HiddenDim)[None, :], + ) + scale_weight = tl.load( + weights + + (pid_batch * next_n + pid_next_n) * stride_w_batch + + pid_q_head * ChunkQ + + tl.arange(0, ChunkQ) + ) + + for context_idx in range( + split_context_start, split_context_start + split_context_length, ChunkK + ): + mask_kv = context_idx + tl.arange(0, ChunkK) < context_length + context_kv_idx = tl.load( + kv_indices + context_start + context_idx + tl.arange(0, ChunkK), + mask=mask_kv, + other=0, + ) + + k = tl.load( + KV_buffer + + context_kv_idx[:, None] * stride_k_seq + + tl.arange(0, HiddenDim)[None, :], + mask=mask_kv[:, None], + other=0.0, + ) + k_scale_f = tl.load(scale_buffer + context_kv_idx[:, None] * stride_scale_seq) + + o = tl.dot(q, k.T) + o = o * k_scale_f.T + o = tl.maximum(o, 0.0) + o = o * scale_weight[None, :].T + + mask = context_idx + tl.arange(0, ChunkK) <= context_length - pid_next_n + o = tl.where(mask[None, :], o, float("-inf")) + + tl.store( + Out_buffer + + (pid_batch * next_n + pid_next_n) * stride_out_batch + + (pid_q_head * ChunkQ + tl.arange(0, ChunkQ)[:, None, None]) + * stride_out_heads + + (context_idx + tl.arange(0, ChunkK)[None, None, :]), + o[:, None, :], + ) + + +@triton.jit +def _deepgemm_fp8_paged_mqa_logits_ragged_k( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch, + stride_q_next_n, + stride_q_heads, + KV_buffer, + stride_k_seq, + scale_buffer, + stride_scale_seq, + prefix_sum_context_lens, + kv_indices, + weights, + stride_w_batch, + OutLogits_buffer, 
+ stride_out_batch, + max_model_len, + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + SplitKV: tl.constexpr = 1, +): + pid = tl.program_id(0) + num_block_q_head = tl.cdiv(heads_num, ChunkQ) + + pid_q_head, remain_pid = pid % num_block_q_head, pid // num_block_q_head + pid_next_n, remain_pid = remain_pid % next_n, remain_pid // next_n + pid_batch, pid_split_kv = remain_pid % batch_size, remain_pid // batch_size + + context_start = tl.load(prefix_sum_context_lens + pid_batch) + context_end = tl.load(prefix_sum_context_lens + pid_batch + 1) + + context_length = context_end - context_start + context_chunk_num = tl.cdiv(context_length, ChunkK) + split_context_chunk_num = tl.cdiv(context_chunk_num, SplitKV) + + split_context_start = (pid_split_kv * split_context_chunk_num) * ChunkK + split_context_length = min( + context_length - split_context_start, split_context_chunk_num * ChunkK + ) + + q = tl.load( + Q_buffer + + pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ((pid_q_head * ChunkQ + tl.arange(0, ChunkQ)) * stride_q_heads)[:, None] + + tl.arange(0, HiddenDim)[None, :], + ) + scale_weight = tl.load( + weights + + (pid_batch * next_n + pid_next_n) * stride_w_batch + + pid_q_head * ChunkQ + + tl.arange(0, ChunkQ) + ) + + for context_idx in range( + split_context_start, split_context_start + split_context_length, ChunkK + ): + mask_kv = context_idx + tl.arange(0, ChunkK) < context_length + context_kv_idx = tl.load( + kv_indices + context_start + context_idx + tl.arange(0, ChunkK), + mask=mask_kv, + other=0, + ) + + k = tl.load( + KV_buffer + + context_kv_idx[:, None] * stride_k_seq + + tl.arange(0, HiddenDim)[None, :], + mask=mask_kv[:, None], + other=0.0, + ) + k_scale_f = tl.load(scale_buffer + context_kv_idx[:, None] * stride_scale_seq) + + o = tl.dot(q, k.T) + o = o * k_scale_f.T + o = tl.maximum(o, 0.0) + o = o * scale_weight[None, :].T + + mask = context_idx + tl.arange(0, ChunkK) <= context_length - pid_next_n + o = 
tl.where(mask[None, :], o, float("-inf")) + + logits = tl.reduce(o, axis=0, combine_fn=_sum_combine) + tl.store( + OutLogits_buffer + + (pid_batch * next_n + pid_next_n) * stride_out_batch + + (context_idx + tl.arange(0, ChunkK)), + logits, + mask=(context_idx + tl.arange(0, ChunkK)) < max_model_len, + ) + + +@triton.jit +def _deepgemm_fp8_paged_mqa_logits_stage1( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch: tl.int64, + stride_q_next_n: tl.int64, + stride_q_heads: tl.int64, + KV_buffer, + stride_k_seq: tl.int64, + scale_buffer, + stride_scale_seq: tl.int64, + context_len_ptr, + kv_indices, + weights, + stride_w_batch: tl.int64, + Out_buffer, + stride_out_heads: tl.int64, + stride_out_batch: tl.int64, + max_model_len, + max_blk_len, + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + SplitKV: tl.constexpr = 1, +): + pid = tl.program_id(0) + num_block_q_head = tl.cdiv(heads_num, ChunkQ) + + pid_q_head, remain_pid = pid % num_block_q_head, pid // num_block_q_head + pid_next_n, remain_pid = remain_pid % next_n, remain_pid // next_n + pid_batch, pid_split_kv = remain_pid % batch_size, remain_pid // batch_size + + context_length = tl.load(context_len_ptr + pid_batch) + + context_chunk_num = tl.cdiv(context_length, ChunkK) + split_context_chunk_num = tl.cdiv(context_chunk_num, SplitKV) + + split_context_start = (pid_split_kv * split_context_chunk_num) * ChunkK + split_context_length = min( + context_length - split_context_start, split_context_chunk_num * ChunkK + ) + + q = tl.load( + Q_buffer + + pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ((pid_q_head * ChunkQ + tl.arange(0, ChunkQ)) * stride_q_heads)[:, None] + + tl.arange(0, HiddenDim)[None, :], + ) + scale_weight = tl.load( + weights + + (pid_batch * next_n + pid_next_n) * stride_w_batch + + pid_q_head * ChunkQ + + tl.arange(0, ChunkQ) + ) + + for context_idx in range( + split_context_start, split_context_start + split_context_length, ChunkK + ): + 
mask_kv = context_idx + tl.arange(0, ChunkK) < context_length + context_kv_idx = tl.load( + kv_indices + pid_batch * max_blk_len + context_idx + tl.arange(0, ChunkK), + mask=mask_kv, + other=0, + ) + + k = tl.load( + KV_buffer + + context_kv_idx[:, None] * stride_k_seq + + tl.arange(0, HiddenDim)[None, :], + mask=mask_kv[:, None], + other=0.0, + ) + k_scale_f = tl.load(scale_buffer + context_kv_idx[:, None] * stride_scale_seq) + + o = tl.dot(q, k.T) + o = o * k_scale_f.T + o = tl.maximum(o, 0.0) + o = o * scale_weight[None, :].T + + mask = ( + context_idx + tl.arange(0, ChunkK) <= context_length - next_n + pid_next_n + ) + o = tl.where(mask[None, :], o, float("-inf")) + + tl.store( + Out_buffer + + (pid_batch * next_n + pid_next_n) * stride_out_batch + + (pid_q_head * ChunkQ + tl.arange(0, ChunkQ)[:, None, None]) + * stride_out_heads + + (context_idx + tl.arange(0, ChunkK)[None, None, :]), + o[:, None, :], + ) + + +@triton.jit +def _deepgemm_fp8_paged_mqa_logits_varctx_schedule( + batch_size, + context_len_ptr, + safe_chunks_per_cta_ptr, + parallel_unit_num, + ChunkK: tl.constexpr, + AlignedBatchSize: tl.constexpr, + TryCount: tl.constexpr, +): + pid = tl.program_id(0) + + ctx_lens = tl.load( + context_len_ptr + tl.arange(0, AlignedBatchSize), + mask=tl.arange(0, AlignedBatchSize) < batch_size, + other=0, + ) + ctx_blks = tl.cdiv(ctx_lens, ChunkK) + + has_successed = False + safe_seg_lens = 0 + for t in range(TryCount): + try_seg_per_pu = 1 + pid * TryCount + TryCount - t + ctx_segs = tl.cdiv(ctx_blks, try_seg_per_pu) + total_segs = tl.sum(ctx_segs) + + if total_segs <= parallel_unit_num: + has_successed = True + elif has_successed: + safe_seg_lens = try_seg_per_pu + 1 + has_successed = False + + try_seg_per_pu = 1 + pid * TryCount + ctx_segs = tl.cdiv(ctx_blks, try_seg_per_pu) + total_segs = tl.sum(ctx_segs) + + if has_successed: + if total_segs > parallel_unit_num: + safe_seg_lens = try_seg_per_pu + 1 + elif try_seg_per_pu == 1: + safe_seg_lens = 1 + + if 
safe_seg_lens != 0: + tl.store(safe_chunks_per_cta_ptr, safe_seg_lens) + + +@triton.jit +def _deepgemm_fp8_paged_mqa_logits( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch, + stride_q_next_n, + stride_q_heads, + KV_buffer, + stride_k_seq, + scale_buffer, + stride_scale_seq, + context_len_ptr, + kv_indices, + weights, + stride_w_batch, + OutLogits_buffer, + stride_out_batch, + max_model_len, + max_blk_len, + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + SplitKV: tl.constexpr = 1, +): + pid = tl.program_id(0) + num_block_q_head = tl.cdiv(heads_num, ChunkQ) + + pid_q_head, remain_pid = pid % num_block_q_head, pid // num_block_q_head + pid_next_n, remain_pid = remain_pid % next_n, remain_pid // next_n + pid_batch, pid_split_kv = remain_pid % batch_size, remain_pid // batch_size + + context_length = tl.load(context_len_ptr + pid_batch) + + context_chunk_num = tl.cdiv(context_length, ChunkK) + split_context_chunk_num = tl.cdiv(context_chunk_num, SplitKV) + + split_context_start = (pid_split_kv * split_context_chunk_num) * ChunkK + split_context_length = min( + context_length - split_context_start, split_context_chunk_num * ChunkK + ) + + q = tl.load( + Q_buffer + + pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ((pid_q_head * ChunkQ + tl.arange(0, ChunkQ)) * stride_q_heads)[:, None] + + tl.arange(0, HiddenDim)[None, :], + ) + scale_weight = tl.load( + weights + + (pid_batch * next_n + pid_next_n) * stride_w_batch + + pid_q_head * ChunkQ + + tl.arange(0, ChunkQ) + ) + + for context_idx in range( + split_context_start, split_context_start + split_context_length, ChunkK + ): + mask_kv = context_idx + tl.arange(0, ChunkK) < context_length + context_kv_idx = tl.load( + kv_indices + pid_batch * max_blk_len + context_idx + tl.arange(0, ChunkK), + mask=mask_kv, + other=0, + ) + + k = tl.load( + KV_buffer + + context_kv_idx[:, None] * stride_k_seq + + tl.arange(0, HiddenDim)[None, :], + mask=mask_kv[:, None], + 
other=0.0, + ) + k_scale_f = tl.load(scale_buffer + context_kv_idx[:, None] * stride_scale_seq) + + o = tl.dot(q, k.T) + o = o * k_scale_f.T + o = tl.maximum(o, 0.0) + o = o * scale_weight[None, :].T + + mask = ( + context_idx + tl.arange(0, ChunkK) <= context_length - next_n + pid_next_n + ) + o = tl.where(mask[None, :], o, float("-inf")) + + logits = tl.reduce(o, axis=0, combine_fn=_sum_combine) + tl.store( + OutLogits_buffer + + (pid_batch * next_n + pid_next_n) * stride_out_batch + + (context_idx + tl.arange(0, ChunkK)), + logits, + mask=(context_idx + tl.arange(0, ChunkK)) < max_model_len, + ) + + +@triton.jit +def _gluon_deepgemm_fp8_paged_mqa_logits( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch, + stride_q_next_n, + stride_q_heads, + KV_buffer, + stride_k_seq, + scale_buffer, + stride_scale_seq, + context_len_ptr, + kv_indices, + weights, + stride_w_batch, + OutLogits_buffer, + stride_out_batch, + max_model_len, + max_block_len, + SplitKV, + dummyPointerArg, # dummy pointer for compatibility with triton3.5 on lower version + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + KVBlockSize: tl.constexpr = 1, +): + # for AOT load use, only need kernel have the same signature as implementation side + pass + + +@triton.jit +def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch, + stride_q_next_n, + stride_q_heads, + KV_buffer, + stride_k_seq, + scale_buffer, + stride_scale_seq, + context_len_ptr, + kv_indices, + weights, + stride_w_batch, + OutLogits_buffer, + stride_out_batch, + max_model_len, + max_block_len, + SplitKV, + dummyPointerArg, # dummy pointer for compatibility with triton3.5 on lower version + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + KVBlockSize: tl.constexpr = 16, +): + # for AOT load use, only need kernel have the same signature as implementation side + pass + + +@triton.jit +def 
_gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch, + stride_q_next_n, + stride_q_heads, + KV_buffer, + stride_k_seq, + scale_buffer, + stride_scale_seq, + context_len_ptr, + kv_indices, + weights, + stride_w_batch, + OutLogits_buffer, + stride_out_batch, + max_model_len, + max_block_len, + safe_chunks_per_cta_ptr, + dummyPointerArg, # dummy pointer for compatibility with triton3.5 on lower version + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + KVBlockSize: tl.constexpr = 16, +): + # for AOT load use, only need kernel have the same signature as implementation side + pass diff --git a/aiter/ops/triton/activation.py b/aiter/ops/triton/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..ff1d9c3236fd3063bfdac99e584e613c19f9fe31 --- /dev/null +++ b/aiter/ops/triton/activation.py @@ -0,0 +1,287 @@ +from .quant import _mxfp4_quant_op +from typing import Literal +import triton +import triton.language as tl +import torch + + +@triton.jit +def _silu(x): + return x * tl.sigmoid(x) + + +@triton.jit +def _silu_exp2(x): + return x / (1.0 + tl.exp2(-(x * 1.44269504089))) + + +@triton.jit +def _tanh(x): + return 2 * tl.sigmoid(2 * x) - 1 + + +@triton.jit +def _gelu(x): + M_SQRT1_2 = 0.70710678118654752440 + ALPHA = M_SQRT1_2 + return 0.5 * x * (1.0 + tl.erf(x * ALPHA)) + + +@triton.jit +def _gelu_tanh(x): + M_SQRT2 = 1.41421356237309504880 + M_2_SQRTPI = 1.12837916709551257390 + BETA = M_SQRT2 * M_2_SQRTPI * 0.5 + KAPPA = 0.044715 + x_cube = x * x * x + inner = BETA * (x + KAPPA * x_cube) + return 0.5 * x * (1.0 + _tanh(inner)) + + +# @tl.constexpr_function +@triton.jit +def _get_activation_from_str(activation: str): + mapping = { + "gelu": _gelu, + "gelu_tanh": _gelu_tanh, + "silu": _silu, + } + return mapping[activation] + + +@triton.heuristics( + { + "EVEN_M_N": lambda args: args["M"] % args["BLOCK_SIZE_M"] == 0 + and args["N"] % 
(args["BLOCK_SIZE_N"] * args["NUM_ITER"]) == 0, + } +) +@triton.jit +def _act_mul_and_dynamic_mxfp4_quant_kernel( + x_ptr, + x_fp4_ptr, + bs_ptr, + stride_x_m_in, + stride_x_n_in, + stride_x_fp4_m_in, + stride_x_fp4_n_in, + stride_bs_m_in, + stride_bs_n_in, + M, + N, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + NUM_ITER: tl.constexpr, + NUM_STAGES: tl.constexpr, + MXFP4_QUANT_BLOCK_SIZE: tl.constexpr, + EVEN_M_N: tl.constexpr, + SCALING_MODE: tl.constexpr, + ACTIVATION: tl.constexpr, + scaleN: tl.constexpr, + scaleM_pad: tl.constexpr, + scaleN_pad: tl.constexpr, + SHUFFLE: tl.constexpr, +): + pid_m = tl.program_id(0) + start_n = tl.program_id(1) * NUM_ITER + # cast strides to int64, in case M*N > max int32 + stride_x_m = tl.cast(stride_x_m_in, tl.int64) + stride_x_n = tl.cast(stride_x_n_in, tl.int64) + stride_x_fp4_m = tl.cast(stride_x_fp4_m_in, tl.int64) + stride_x_fp4_n = tl.cast(stride_x_fp4_n_in, tl.int64) + stride_bs_m = tl.cast(stride_bs_m_in, tl.int64) + stride_bs_n = tl.cast(stride_bs_n_in, tl.int64) + + NUM_QUANT_BLOCKS: tl.constexpr = BLOCK_SIZE_N // MXFP4_QUANT_BLOCK_SIZE + + for pid_n in tl.range(start_n, min(start_n + NUM_ITER, N), num_stages=NUM_STAGES): + x_offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + x_offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + x_offs = x_offs_m[:, None] * stride_x_m + x_offs_n[None, :] * stride_x_n + + if EVEN_M_N: + a = tl.load(x_ptr + x_offs, cache_modifier=".cg").to(tl.float32) + b = tl.load(x_ptr + x_offs + stride_x_n * N, cache_modifier=".cg").to( + tl.float32 + ) + else: + x_mask = (x_offs_m < M)[:, None] & (x_offs_n < N)[None, :] + a = tl.load(x_ptr + x_offs, mask=x_mask, cache_modifier=".cg").to( + tl.float32 + ) + # a and b can share the same mask + b = tl.load( + x_ptr + x_offs + stride_x_n * N, mask=x_mask, cache_modifier=".cg" + ).to(tl.float32) + + x = _get_activation_from_str(ACTIVATION)(a) * b + + out_tensor, bs_e8m0 = _mxfp4_quant_op( + x, BLOCK_SIZE_N, 
BLOCK_SIZE_M, MXFP4_QUANT_BLOCK_SIZE + ) + + out_offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + out_offs_n = pid_n * BLOCK_SIZE_N // 2 + tl.arange(0, BLOCK_SIZE_N // 2) + out_offs = ( + out_offs_m[:, None] * stride_x_fp4_m + out_offs_n[None, :] * stride_x_fp4_n + ) + + if EVEN_M_N: + tl.store(x_fp4_ptr + out_offs, out_tensor) + else: + out_mask = (out_offs_m < M)[:, None] & (out_offs_n < (N // 2))[None, :] + tl.store(x_fp4_ptr + out_offs, out_tensor, mask=out_mask) + + bs_offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + bs_offs_n = pid_n * NUM_QUANT_BLOCKS + tl.arange(0, NUM_QUANT_BLOCKS) + if SHUFFLE: + bs_offs_0 = bs_offs_m[:, None] // 32 + bs_offs_1 = bs_offs_m[:, None] % 32 + bs_offs_2 = bs_offs_1 % 16 + bs_offs_1 = bs_offs_1 // 16 + bs_offs_3 = bs_offs_n[None, :] // 8 + bs_offs_4 = bs_offs_n[None, :] % 8 + bs_offs_5 = bs_offs_4 % 4 + bs_offs_4 = bs_offs_4 // 4 + bs_offs = ( + bs_offs_1 + + bs_offs_4 * 2 + + bs_offs_2 * 2 * 2 + + bs_offs_5 * 2 * 2 * 16 + + bs_offs_3 * 2 * 2 * 16 * 4 + + bs_offs_0 * 2 * 16 * scaleN + ) + bs_mask1 = (bs_offs_m < M)[:, None] & (bs_offs_n < scaleN)[None, :] + bs_mask = (bs_offs_m < scaleM_pad)[:, None] & (bs_offs_n < scaleN_pad)[ + None, : + ] + bs_e8m0 = tl.where(bs_mask1, bs_e8m0, 127) + else: + bs_offs = ( + bs_offs_m[:, None] * stride_bs_m + bs_offs_n[None, :] * stride_bs_n + ) + bs_mask = (bs_offs_m < M)[:, None] & (bs_offs_n < scaleN)[None, :] + if EVEN_M_N: + tl.store(bs_ptr + bs_offs, bs_e8m0) + else: + + tl.store( + bs_ptr + bs_offs, + bs_e8m0, + mask=bs_mask, + ) + + +def act_mul_and_mxfp4_quant( + x: torch.Tensor, + activation: Literal["silu", "gelu", "gelu_tanh"], + scaling_mode: str = "even", + shuffle: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply the activation function and quantize the result to MX FP4 format. + + Args: + x: The input tensor, typically fp16 or bf16. + activation: activation function to apply before quantization. 
+ - It splits the features into two parts and applies the activation to the first part. + - Then, it adds the results together before quantization. + - Supports the following activations: + - "silu" + - "gelu" + - "gelu_tanh" + + scaling_mode: The method to calculate MX block scaling. + - "even" (default): `even_round` in `quark.torch.quantization.utils`. + - etc. + shuffle: Indicates whether to enable preshuffling of scales. + - When enabled, scale dimensions (X, Y) are adjusted to be multiples of 8 and 256, respectively. + Returns: + A tuple of (x_fp4, blockscale_e8m0). + """ + # Assume x is 2D-Tensor for now + M, N = x.shape + # Activation (N/2) and storing results in uint8 (N/2) results in a feature dimension of N/4 + assert N % 4 == 0 + + # This is fixed by spec for MXFP4. Do not tune this. + MXFP4_QUANT_BLOCK_SIZE = 32 + N_half = N // 2 + x_fp4 = torch.empty((M, N_half // 2), dtype=torch.uint8, device=x.device) + scaleN_valid = triton.cdiv(N_half, MXFP4_QUANT_BLOCK_SIZE) + # Setting scale M to be multiple of 256 and scale N to be multiple of 8 + if shuffle: + scaleM = triton.cdiv(M, 256) * 256 + scaleN = triton.cdiv(scaleN_valid, 8) * 8 + blockscale_e8m0 = torch.empty( + (scaleM, scaleN), + dtype=torch.uint8, + device=x.device, + ) + else: + scaleM = M + scaleN = scaleN_valid + blockscale_e8m0 = torch.empty( + (scaleN, scaleM), + dtype=torch.uint8, + device=x.device, + ).T + + # for large N values + if M <= 32: + NUM_ITER = 1 + BLOCK_SIZE_M = min(8, triton.next_power_of_2(M)) + BLOCK_SIZE_N = 128 + NUM_WARPS = 1 if BLOCK_SIZE_M < 4 else 4 + NUM_STAGES = 1 + else: + NUM_ITER = 1 + BLOCK_SIZE_M = 16 + BLOCK_SIZE_N = 256 + NUM_WARPS = 4 + NUM_STAGES = 1 + + # for small N values + if N_half <= 1024: + NUM_ITER = 1 + NUM_STAGES = 1 + NUM_WARPS = 4 + BLOCK_SIZE_N = min(256, triton.next_power_of_2(N_half)) + # BLOCK_SIZE_N needs to be multiple of 32 + BLOCK_SIZE_N = max(32, BLOCK_SIZE_N) + BLOCK_SIZE_M = min(8, triton.next_power_of_2(N_half)) + + # shuffle requires 
def fp8_mqa_logits(
    Q,
    KV,
    kv_scales,
    weights,
    cu_starts,
    cu_ends,
):
    """
    Compute the MQA logits consumed by a top-k selection for sparse attention.

    Arguments (shapes/dtypes as stated by the original author):
        Q:         [seq_len, NUM_HEADS, HEAD_SIZE], float8
        KV:        [seq_len_kv, HEAD_SIZE], float8
        kv_scales: [seq_len_kv], float32
        weights:   [seq_len, NUM_HEADS], float32
        cu_starts: [seq_len], int32, start indices
        cu_ends:   [seq_len], int32, end indices

    Returns:
        logits: [seq_len, seq_len_kv] float32 tensor allocated on Q's device.

    Two launch strategies are used, chosen by whether seq_len_kv is a
    multiple of the KV tile width:
      * ragged tail  -> the output is pre-filled with -inf and the main kernel
        runs with LOGITS_MASKING=True;
      * aligned      -> the output starts uninitialised, the main kernel runs
        with LOGITS_MASKING=False, and a second kernel then resets positions
        outside each row's [cu_start, cu_end) window to -inf.
    """
    # KV tile width handed to the main kernel as BLOCK_KV.
    # NOTE: a 128-wide tile was tried previously; 64 is the pinned value.
    kv_tile = 64

    seq_len, num_heads, head_size = Q.shape
    seq_len_kv = KV.shape[0]
    ragged_tail = seq_len_kv % kv_tile != 0

    # TODO: Currently assuming num_heads and head_size is power of 2.
    assert num_heads & (num_heads - 1) == 0, "num q. heads should be power of 2."
    assert head_size & (head_size - 1) == 0, "head size should be power of 2."

    out_shape = (seq_len, seq_len_kv)
    if ragged_tail:
        # -inf pre-fill is needed because of causal masking.
        logits = torch.full(
            out_shape,
            fill_value=-float("inf"),
            dtype=torch.float32,
            device=Q.device,
        )
    else:
        logits = torch.empty(out_shape, dtype=torch.float32, device=Q.device)

    # Unpacking also enforces the expected rank of every operand.
    q_stride_s, q_stride_h, q_stride_d = Q.stride()
    kv_stride_s, kv_stride_d = KV.stride()
    w_stride_s, w_stride_h = weights.stride()
    out_stride_s, out_stride_k = logits.stride()

    # MFMA instruction shape. NOTE: a heuristic (32, dropping to 16 when
    # seq_len <= 1024) was tried previously; 16 is the pinned value.
    mfma_nonkdim = 16

    # One program per query row.
    row_grid = (seq_len,)

    _fp8_mqa_logits_kernel[row_grid](
        Q_ptr=Q,
        KV_ptr=KV,
        kv_scales_ptr=kv_scales,
        weights_ptr=weights,
        cu_start_ptr=cu_starts,
        cu_end_ptr=cu_ends,
        logits_ptr=logits,
        seq_len=seq_len,
        seq_len_kv=seq_len_kv,
        NUM_HEADS=num_heads,
        HEAD_SIZE=head_size,
        stride_q_s=q_stride_s,
        stride_q_h=q_stride_h,
        stride_q_d=q_stride_d,
        stride_kv_s=kv_stride_s,
        stride_kv_d=kv_stride_d,
        stride_w_s=w_stride_s,
        stride_w_h=w_stride_h,
        stride_logits_s=out_stride_s,
        stride_logits_k=out_stride_k,
        BLOCK_KV=kv_tile,
        num_warps=4,
        num_stages=1,
        waves_per_eu=2,
        matrix_instr_nonkdim=mfma_nonkdim,
        LOGITS_MASKING=ragged_tail,
    )

    if ragged_tail:
        return logits

    # Clear per-row invalid KV positions (outside [cu_start, cu_end)) to -inf.
    _fp8_mqa_clear_logits_kernel[row_grid](
        logits_ptr=logits,
        cu_start_ptr=cu_starts,
        cu_end_ptr=cu_ends,
        seq_len_kv=seq_len_kv,
        stride_logits_s=out_stride_s,
        stride_logits_k=out_stride_k,
        BLOCK_KV=256,
        waves_per_eu=2,
        num_warps=4,
        num_stages=1,
    )
    return logits
It will backward +# to triton JIT kernel +# ======================================================================== + +import os +import math +from functools import lru_cache + +import torch +import triton +from packaging.version import Version +from triton.backends.compiler import GPUTarget + +from aiter import dtypes +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH +#from aiter.utility.triton.triton_metadata_redirect import AOTMetadataContext + +from aiter.jit.utils.chip_info import get_gfx + +enable_aot_gluon_pa_mqa_logits = os.environ.get( + "AITER_ENABLE_AOT_GLUON_PA_MQA_LOGITS", "0" +) +enable_aot_gluon_pa_mqa_logits = enable_aot_gluon_pa_mqa_logits == "1" +triton_version = Version(Version(triton.__version__).base_version) +if triton_version >= Version("3.5.0"): + from triton.experimental.gluon._runtime import GluonASTSource as ASTSource + + from aiter.ops.triton._triton_kernels.attention.pa_mqa_logits import ( + _deepgemm_fp8_paged_mqa_logits, + _deepgemm_fp8_paged_mqa_logits_varctx_schedule, + _deepgemm_fp8_paged_mqa_logits_ragged_k, + _deepgemm_fp8_paged_mqa_logits_stage1, + _deepgemm_fp8_paged_mqa_logits_stage1_ragged_k, + ) + from aiter.ops.triton.gluon.pa_mqa_logits import ( + _gluon_deepgemm_fp8_paged_mqa_logits, + _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle, + _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx, + ) + + enable_gluon_pa_mqa_logits = True + enable_jit_gluon_pa_mqa_logits_kernel = not enable_aot_gluon_pa_mqa_logits +else: + from triton.compiler import ASTSource + + from aiter.ops.triton._triton_kernels.attention.pa_mqa_logits import ( + _deepgemm_fp8_paged_mqa_logits, + _deepgemm_fp8_paged_mqa_logits_varctx_schedule, + _deepgemm_fp8_paged_mqa_logits_ragged_k, + _deepgemm_fp8_paged_mqa_logits_stage1, + _deepgemm_fp8_paged_mqa_logits_stage1_ragged_k, + _gluon_deepgemm_fp8_paged_mqa_logits, + _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle, + _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx, + ) + + 
def deepgemm_fp8_paged_mqa_logits_ragged_k(
    q_fp8: torch.Tensor,  # dtype = float8
    kv_cache_fp8: torch.Tensor,  # dtype = float8
    weights: torch.Tensor,  # dtype = float32
    out_logits: torch.Tensor,  # dtype = float32
    prefix_sum_context_lens: torch.Tensor,
    kv_indices: torch.Tensor,
    max_model_len: int,
    ChunkK: int = 64,
    SplitKV: int = 5,
):
    """
    Launch the ragged-K paged MQA logits kernel; results go into `out_logits`.

    `kv_cache_fp8` packs, along its last dimension, `hidden_dim` payload
    entries followed by the per-entry scale bytes (`hidden_dim` is taken from
    the last dimension of `q_fp8`). The two parts are split here and handed to
    the kernel as separate tensors.

    The launch uses one program per (batch, next_n, KV split); the query tile
    (`ChunkQ`) always covers all heads. Nothing is returned.
    """
    batch_size, next_n, heads, hidden_dim = q_fp8.shape

    # Triton has no reinterpret_cast, so the trailing scale bytes are sliced
    # off on the host and re-viewed as float32; the payload is viewed as fp8.
    payload = kv_cache_fp8[..., :hidden_dim]
    scale_bytes = kv_cache_fp8[..., hidden_dim:]
    kv_scale = scale_bytes.view(torch.float32)
    kv_payload = payload.view(dtypes.fp8)

    launch_grid = (batch_size * next_n * SplitKV,)
    _deepgemm_fp8_paged_mqa_logits_ragged_k[launch_grid](
        batch_size,
        next_n,
        heads,
        q_fp8,
        q_fp8.stride(0),
        q_fp8.stride(1),
        q_fp8.stride(2),
        kv_payload,
        kv_payload.stride(0),
        kv_scale,
        kv_scale.stride(0),
        prefix_sum_context_lens,
        kv_indices,
        weights,
        weights.stride(0),
        out_logits,
        out_logits.stride(0),
        max_model_len,
        ChunkQ=heads,
        ChunkK=ChunkK,
        HiddenDim=hidden_dim,
        SplitKV=SplitKV,
    )
kv_cache_scale = ( + kv_cache_fp8[..., :hidden_dim], + kv_cache_fp8[..., hidden_dim:], + ) + # Since triton doesn't have the reinterpret_cast, we slice the scale out and view it as float + kv_cache_scale = kv_cache_scale.view(torch.float32) + kv_cache_fp8 = kv_cache_fp8.view(dtypes.fp8) + + config = { + "ChunkQ": 32, + "ChunkK": 64, + "HiddenDim": hidden_dim, + "SplitKV": 5, + } + assert heads % config["ChunkQ"] == 0 + + grid = (batch_size * next_n * (heads // config["ChunkQ"] * config["SplitKV"]),) + _deepgemm_fp8_paged_mqa_logits_stage1_ragged_k[grid]( + batch_size, + next_n, + heads, + q_fp8, + q_fp8.stride(0), + q_fp8.stride(1), + q_fp8.stride(2), + kv_cache_fp8, + kv_cache_fp8.stride(0), + kv_cache_scale, + kv_cache_scale.stride(0), + prefix_sum_context_lens, + kv_indices, + weights, + weights.stride(0), + out_qk, + out_qk.stride(0), + out_qk.stride(1), + max_model_len, + **config, + ) + + +def deepgemm_fp8_paged_mqa_logits_stage1( + q_fp8: torch.Tensor, # dtype = float8 + kv_cache_fp8: torch.Tensor, # dtype = float8 [num_blocks, 1, 1, D+4] + weights: torch.Tensor, # dtype = float32 + out_qk: torch.Tensor, # dtype = float32 + context_lens: torch.Tensor, + kv_indices: torch.Tensor, + max_model_len: int, + ChunkQ: int = 64, + ChunkK: int = 256, + TotalCuCount: int = None, + WavePerEU: int = 2, +): + if TotalCuCount is None: + TotalCuCount = _default_total_cu_count() + + batch_size, next_n, heads, hidden_dim = q_fp8.size() + _, max_blk_len = kv_indices.size() + + TileQCount = batch_size * next_n * (heads // ChunkQ) + SplitKV = (max(1, TotalCuCount // TileQCount) + 4) // 5 * 5 * WavePerEU + + kv_cache_fp8, kv_cache_scale = ( + kv_cache_fp8[..., :hidden_dim], + kv_cache_fp8[..., hidden_dim:], + ) + # Since triton doesn't have the reinterpret_cast, we slice the scale out and view it as float + kv_cache_scale = kv_cache_scale.view(torch.float32) + kv_cache_fp8 = kv_cache_fp8.view(dtypes.fp8) + + config = { + "ChunkQ": ChunkQ, + "ChunkK": ChunkK, + "HiddenDim": 
@lru_cache(maxsize=None)
def _compile_deepgemm_fp8_paged_mqa_logits(
    ChunkQ,
    ChunkK,
    Preshuffle,
    KVBlockSize,
    HiddenDim,
    is_padded_mode: bool,
    WavePerEU: int = 2,
    VarCtxOpt: bool = False,
):
    """
    Build (and memoise per argument tuple) the gluon paged-MQA-logits kernel.

    Args:
        ChunkQ, ChunkK, KVBlockSize, HiddenDim: constexpr tile parameters baked
            into the compiled kernel.
        Preshuffle: select the preshuffle kernel variant.
        is_padded_mode: when True, the KV buffer / scale buffer and their
            strides are annotated as 16-divisible.
        WavePerEU: forwarded as the `waves_per_eu` compile option.
        VarCtxOpt: the kernel takes a `safe_chunks_per_cta_ptr` pointer instead
            of the scalar `SplitKV`; with Preshuffle it also selects the
            `_varctx` kernel variant.

    Returns:
        The compiled kernel object produced by `triton.compile`.

    Raises:
        AssertionError: unsupported GPU architecture, or the pre-3.4 Triton ABI
            shim being requested while JIT compilation is enabled.
        RuntimeError: the AOT loading path is requested. That path is disabled
            in this file (the AOTMetadataContext import and its use are
            commented out), so previously `kernel` was never assigned and the
            function died with an opaque UnboundLocalError.
    """
    gfx_version = get_gfx()
    assert gfx_version in (
        "gfx942",
        "gfx950",
    ), f"gluon paged MQA logits supports gfx942/gfx950 only, got {gfx_version}"
    target = GPUTarget("hip", gfx_version, 64)

    gfx_fp8_pointer = "*fp8e4b8" if gfx_version == "gfx942" else "*fp8e4nv"

    # NOTE: insertion order is the kernel's positional argument order.
    fn_signature = {
        "batch_size": "i32",
        "next_n": "i32",
        "heads_num": "i32",
        "Q_buffer": gfx_fp8_pointer,
        "stride_q_batch": "i32",
        "stride_q_next_n": "i32",
        "stride_q_heads": "i32",
        "KV_buffer": gfx_fp8_pointer,
        "stride_k_seq": "i32",
        "scale_buffer": "*fp32",
        "stride_scale_seq": "i32",
        "context_len_ptr": "*i32",
        "kv_indices": "*i32",
        "weights": "*fp32",
        "stride_w_batch": "i32",
        "OutLogits_buffer": "*fp32",
        "stride_out_batch": "i32",
        "max_model_len": "i32",
        "max_block_len": "i32",
    }
    if VarCtxOpt:
        fn_signature["safe_chunks_per_cta_ptr"] = "*i32"
    else:
        fn_signature["SplitKV"] = "i32"

    if triton_version < Version("3.4.0"):
        # Pre-3.4 ABI shim: an extra trailing pointer argument, only meaningful
        # for AOT-compiled kernels.
        assert not enable_jit_gluon_pa_mqa_logits_kernel
        fn_signature["dummyPointerArg"] = "*i32"
    fn_signature["ChunkQ"] = "constexpr"
    fn_signature["ChunkK"] = "constexpr"
    fn_signature["KVBlockSize"] = "constexpr"
    fn_signature["HiddenDim"] = "constexpr"

    # Pick the kernel variant and its exported name together so the two can
    # never drift apart.
    if not Preshuffle:
        kernel_fn = _gluon_deepgemm_fp8_paged_mqa_logits
        kernel_name = "_gluon_deepgemm_fp8_paged_mqa_logits"
    elif VarCtxOpt:
        kernel_fn = _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx
        kernel_name = "_gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx"
    else:
        kernel_fn = _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle
        kernel_name = "_gluon_deepgemm_fp8_paged_mqa_logits_preshuffle"

    options = {
        "num_warps": 4,
        "waves_per_eu": WavePerEU,
        "num_stages": 2,
        "num_ctas": 1,
        "cluster_dims": [1, 1, 1],
        "arch": gfx_version,
        "backend_name": "hip",
        "warp_size": 64,
        "name": kernel_name,
    }

    kv_cache_attr = []
    if is_padded_mode:
        kv_cache_attr.append(["tt.divisibility", 16])

    src = ASTSource(
        fn=kernel_fn,
        signature=fn_signature,
        constexprs={
            "ChunkQ": ChunkQ,
            "ChunkK": ChunkK,
            "KVBlockSize": KVBlockSize,
            "HiddenDim": HiddenDim,
        },
        attrs={
            (2,): [["tt.divisibility", 16]],  # heads_num
            (3,): [["tt.divisibility", 16], ["tt.pointer_range", 32]],  # Q_buffer
            (4,): [["tt.divisibility", 16]],  # stride_q_batch
            (5,): [["tt.divisibility", 16]],  # stride_q_next_n
            (6,): [["tt.divisibility", 16]],  # stride_q_heads
            (7,): kv_cache_attr,  # KV_buffer
            (8,): kv_cache_attr,  # stride_k_seq
            (9,): kv_cache_attr,  # scale_buffer
            (10,): kv_cache_attr,  # stride_scale_seq
            (11,): [["tt.pointer_range", 32]],  # context_len_ptr
            (12,): [["tt.pointer_range", 32]],  # kv_indices
            (13,): [
                ["tt.divisibility", 16],
                ["tt.pointer_range", 32],
            ],  # weights
            (14,): [["tt.divisibility", 16]],  # stride_w_batch
            (15,): [["tt.pointer_range", 32]],  # OutLogits_buffer
        },
    )

    if enable_jit_gluon_pa_mqa_logits_kernel:
        return triton.compile(
            src,
            target=target,
            options=options,
        )

    # AOT path. The artifact name encodes every parameter that affects codegen.
    padded_str = "T" if is_padded_mode and not Preshuffle else "F"
    preshuffle_suffix = "_preshuffle" if Preshuffle else ""
    varctx_suffix = "_varctx" if VarCtxOpt else ""
    kernel_str = f"paged_mqa_logits{preshuffle_suffix}{varctx_suffix}_{ChunkQ}x{ChunkK}x{HiddenDim}_B{KVBlockSize}P{padded_str}W{WavePerEU}"
    metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/paged_mqa_logits/aot/{kernel_str}"
    # NOTE: loading used to wrap triton.compile(src, target=target,
    # options=options) in AOTMetadataContext(kernel_fn.fn.__name__, metadata_pth).
    # That helper is not imported in this file any more, so fail loudly instead
    # of falling through to an unbound `kernel`.
    raise RuntimeError(
        "AOT gluon kernel loading for pa_mqa_logits is disabled in this build "
        f"(AOTMetadataContext is unavailable); cannot load '{metadata_pth}'. "
        "Set AITER_ENABLE_AOT_GLUON_PA_MQA_LOGITS=0 to use the JIT kernel instead."
    )
max_block_len = kv_indices.size() + + TileQCount = batch_size * next_n + SplitKV = (max(1, TotalCuCount // TileQCount) + 4) // 5 * 5 * WavePerEU + + assert ChunkK % KVBlockSize == 0 or KVBlockSize % ChunkK == 0 + assert block_Size == KVBlockSize + if Preshuffle: + assert ( + KVBlockSize % 16 == 0 + ), f"Preshuffle mode only supports KVBlockSize aligned to 16. Got KVBlockSize={KVBlockSize}" + + kv_cache = kv_cache.view(-1, KVBlockSize * index_dim) + kv_cache_fp8, kv_cache_scale = ( + kv_cache[..., : KVBlockSize * hidden_dim], + kv_cache[..., KVBlockSize * hidden_dim :], + ) + kv_cache_fp8 = kv_cache_fp8.view(dtypes.fp8) + kv_cache_scale = kv_cache_scale.view(torch.float32) + + VarCtxOpt = VarCtxSchedule is not None + if VarCtxOpt: + grid = (TotalCuCount * WavePerEU, 1, 1) + else: + grid = (batch_size * next_n * SplitKV, 1, 1) + + if enable_gluon_pa_mqa_logits: + is_padded_mode = kv_cache_fp8.stride(0) % 16 == 0 + kernel = _compile_deepgemm_fp8_paged_mqa_logits( + ChunkQ=heads, + ChunkK=ChunkK, + Preshuffle=Preshuffle, + KVBlockSize=KVBlockSize, + HiddenDim=hidden_dim, + is_padded_mode=is_padded_mode, + WavePerEU=WavePerEU, + VarCtxOpt=VarCtxOpt, + ) + if triton_version >= Version("3.5.0"): + kernel[grid]( + batch_size, + next_n, + heads, + q_fp8, + q_fp8.stride(0), + q_fp8.stride(1), + q_fp8.stride(2), + kv_cache_fp8, + kv_cache_fp8.stride(0), + kv_cache_scale, + kv_cache_scale.stride(0), + context_lens, + kv_indices, + weights, + weights.stride(0), + out_logits, + out_logits.stride(0), + max_model_len, + max_block_len, + SplitKV if not VarCtxOpt else VarCtxSchedule, + # constexpr + heads, + ChunkK, + KVBlockSize, + hidden_dim, + ) + else: # load AOT compiled gluon kernel + assert triton_version < Version( + "3.4.0" + ), "https://github.com/triton-lang/triton/pull/7258 involves a ABI-breaking change on triton3.4, " + "which adding an extra pointer argument at the end of kernel arguments. 
@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
        "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"])
        * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]),
    }
)
@triton.jit
def _batched_gemm_a8w8_kernel(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    bias_ptr,
    # Matrix dimensions
    M,
    N,
    K,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_ab,
    stride_am,
    stride_ak,
    stride_bb,
    stride_bk,
    stride_bn,
    stride_cb,
    stride_cm,
    stride_cn,
    stride_ascaleb,
    stride_bscaleb,
    stride_biasb,
    # Meta-parameters
    HAS_BIAS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    GRID_MN: tl.constexpr,
):
    """
    Note: this is a Triton jitted function and is not meant to be called
    directly. Call the batched_gemm_a8w8 function below instead.

    Computes the matmul C[i] = A[i] x B[i] and applies a conversion scale for
    every i in a given batch. Optionally, adds a bias to each result.

    The conversion scale for each matmul is received in the form of two 1D
    tensors that are multiplied to form a 2D one before being applied.

    Key parameters:
    - A: Batch tensor A with shape (B, M, K).
    - B: Batch tensor B with shape (B, K, N).
    - C: Batch tensor C with shape (B, M, N).
    - A_scale: First scale batch tensor with shape (B, M, 1).
    - B_scale: Second scale batch tensor with shape (B, 1, N).
    - Bias: Bias batch tensor with shape (B, 1, N).

    Launch grid: axis 0 is the batch index, axis 1 enumerates the (M, N)
    output tiles. Scale and bias rows are indexed with unit element stride;
    only their batch stride is passed in.
    """

    tl.assume(stride_ab > 0)
    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bb > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_cb > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)
    tl.assume(stride_ascaleb > 0)
    tl.assume(stride_bscaleb > 0)
    if HAS_BIAS:
        # The host wrapper passes a bias stride of 0 when there is no bias, so
        # the positivity hint is only valid (and only useful) with a bias.
        tl.assume(stride_biasb > 0)

    # -----------------------------------------------------------
    # Get batch program id
    batch_id = tl.program_id(axis=0)
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=1)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    if GROUP_SIZE_M == 1:
        pid_m = pid // num_pid_n
        pid_n = pid % num_pid_n
    else:
        num_pid_in_group = GROUP_SIZE_M * num_pid_n
        group_id = pid // num_pid_in_group
        first_pid_m = group_id * GROUP_SIZE_M
        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
        pid_m = first_pid_m + (pid % group_size_m)
        pid_n = (pid % num_pid_in_group) // group_size_m

    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    # Cast batch id and batch dimension strides to int64 to avoid int32 overflow during offset calculation
    # Note: If you're attempting to cast strides to int64 to prevent integer overflow, use `tl.cast` instead of `.to()`.
    # See https://github.com/ROCm/aiter/pull/597 for rationale
    batch_id = tl.cast(batch_id, tl.int64)
    stride_ab = tl.cast(stride_ab, tl.int64)
    stride_bb = tl.cast(stride_bb, tl.int64)
    stride_cb = tl.cast(stride_cb, tl.int64)

    # Create pointers for first block of A and B input matrices.
    # Row/column offsets are wrapped modulo M/N so that the unmasked loads of
    # edge tiles stay in bounds; the surplus results are dropped by the store mask.
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    a_ptrs = a_ptr + (
        batch_id * stride_ab
        + offs_am[:, None] * stride_am
        + offs_k[None, :] * stride_ak
    )
    b_ptrs = b_ptr + (
        batch_id * stride_bb
        + offs_k[:, None] * stride_bk
        + offs_bn[None, :] * stride_bn
    )

    # Load the per-row / per-column scales with the same wrapped offsets.
    # (Previously written as `pid * BLOCK + arange % DIM`, where `%` binds
    # tighter than `+`, so edge tiles read past the end of the scale rows.)
    a_scale = tl.load(a_scale_ptr + batch_id * stride_ascaleb + offs_am)
    b_scale = tl.load(b_scale_ptr + batch_id * stride_bscaleb + offs_bn)

    acc_dtype = tl.float32 if c_ptr.type.element_ty != tl.int8 else tl.int32
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype)

    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Load the next block of A and B, generate a mask by checking the K dimension.
        # If it is out of bounds, set it to 0.
        if EVEN_K:
            a = tl.load(a_ptrs)
            b = tl.load(b_ptrs)
        else:
            a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)

        accumulator += tl.dot(a, b, input_precision="ieee")

        # Advance the ptrs to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Apply scale
    accumulator *= a_scale[:, None] * b_scale[None, :]

    # Add bias (indexed by output column, same wrapped offsets as B)
    if HAS_BIAS:
        bias = tl.load(bias_ptr + batch_id * stride_biasb + offs_bn)
        accumulator = accumulator.to(bias_ptr.type.element_ty) + bias[None, :]

    c = accumulator.to(c_ptr.type.element_ty)

    # Write back the block of the output matrix C with masks.
    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = (
        c_ptr
        + stride_cb * batch_id
        + stride_cm * offs_cm[:, None]
        + stride_cn * offs_cn[None, :]
    )
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)

    tl.store(c_ptrs, c, mask=c_mask)
+ - WQ: Batch tensor WQ with shape (B, N, K). + - X_scale: First scale batch tensor with shape (B, M, 1). + - W_scale: Second scale batch tensor with shape (B, 1, N). + - Bias: Bias batch tensor with shape (B, 1, N). + - YQ: Output Matrix Y with shape (B, M, N). If this is none, then it's created by this API and returned as output + + Returns: + - YQ: The output batch tensor with shape (B, M, N). + """ + + # Make sure XQ and WQ are contiguous in memory + XQ = XQ.contiguous() + WQ = WQ.contiguous() + + # Check constraints. + assert XQ.shape[0] == WQ.shape[0], "Incompatible Batch dimensions!!!" + assert XQ.shape[2] == WQ.shape[2], "Incompatible K dimensions!!!" + assert dtype in [ + torch.bfloat16, + torch.float16, + ], f"Output {dtype=} is currently not supported in batched_gemm_a8w8" + assert splitK == None, "Currently, there isn't any support for splitK on Triton" + + # Transpose N and K dimensions of WQ: (B, N, K) -> (B, K, N) + WQ = WQ.transpose(1, 2) + + B = XQ.shape[0] + M = XQ.shape[1] + K = XQ.shape[2] + N = WQ.shape[2] + + has_bias = bias is not None + if YQ is None: + YQ = torch.empty((B, M, N), dtype=dtype, device=XQ.device) + + if config is None: + config = _get_config(M, N, K) + + grid = lambda META: ( # noqa: E731 + B, + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + + _batched_gemm_a8w8_kernel[grid]( + XQ, + WQ, + YQ, + x_scale, + w_scale, + bias, + M, + N, + K, + XQ.stride(0), + XQ.stride(1), + XQ.stride(2), + WQ.stride(0), + WQ.stride(1), + WQ.stride(2), + YQ.stride(0), + YQ.stride(1), + YQ.stride(2), + x_scale.stride(0), + w_scale.stride(0), + bias.stride(0) if has_bias else 0, + has_bias, + **config, + ) + + return YQ diff --git a/aiter/ops/triton/batched_gemm_afp4wfp4.py b/aiter/ops/triton/batched_gemm_afp4wfp4.py new file mode 100644 index 0000000000000000000000000000000000000000..32e826f038bc983a0d026c86db2d50dde758849b --- /dev/null +++ b/aiter/ops/triton/batched_gemm_afp4wfp4.py @@ -0,0 +1,486 @@ +# 
# Module-wide switch, off by default; flipped through set_use_gemm_splitk_bf16().
# (A `global` statement at module scope has no effect, so none is used here.)
_USE_GEMM_SPLITK_BF16 = False


def set_use_gemm_splitk_bf16(value: bool) -> None:
    """Set the module-level `_USE_GEMM_SPLITK_BF16` flag to *value*."""
    global _USE_GEMM_SPLITK_BF16
    _USE_GEMM_SPLITK_BF16 = value
+ A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + + tl.assume(stride_in_ab > 0) + tl.assume(stride_in_am > 0) + tl.assume(stride_in_ak > 0) + tl.assume(stride_in_bb > 0) + tl.assume(stride_in_bk > 0) + tl.assume(stride_in_bn > 0) + tl.assume(stride_in_cb > 0) + tl.assume(stride_in_cm > 0) + tl.assume(stride_in_cn > 0) + tl.assume(stride_in_asb > 0) + tl.assume(stride_in_asm > 0) + tl.assume(stride_in_ask > 0) + tl.assume(stride_in_bsb > 0) + tl.assume(stride_in_bsk > 0) + tl.assume(stride_in_bsn > 0) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid_batch = tl.program_id(axis=0) + pid_unified = tl.program_id(axis=1) + pid_k = pid_unified % NUM_KSPLIT + pid = pid_unified // NUM_KSPLIT + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + # Cast batch id and batch dimension strides to int64 to avoid int32 overflow during offset calculation + # Note: If you're attempting to cast strides to int64 to prevent integer overflow, use `tl.cast` instead of `.to()`. 
+ # See https://github.com/ROCm/aiter/pull/597 for rationale + # stride_ab = tl.cast(stride_ab, tl.int64) + # stride_bb = tl.cast(stride_bb, tl.int64) + # stride_cb = tl.cast(stride_cb, tl.int64) + # pid_batch = tl.cast(pid_batch, tl.int64) + + stride_ab = tl.cast(stride_in_ab, tl.int64) + stride_am = tl.cast(stride_in_am, tl.int64) + stride_ak = tl.cast(stride_in_ak, tl.int64) + stride_bb = tl.cast(stride_in_bb, tl.int64) + stride_bk = tl.cast(stride_in_bk, tl.int64) + stride_bn = tl.cast(stride_in_bn, tl.int64) + stride_cb = tl.cast(stride_in_cb, tl.int64) + stride_ck = tl.cast(stride_in_ck, tl.int64) + stride_cm = tl.cast(stride_in_cm, tl.int64) + stride_cn = tl.cast(stride_in_cn, tl.int64) + stride_asb = tl.cast(stride_in_asb, tl.int64) + stride_asm = tl.cast(stride_in_asm, tl.int64) + stride_ask = tl.cast(stride_in_ask, tl.int64) + stride_bsb = tl.cast(stride_in_bsb, tl.int64) + stride_bsk = tl.cast(stride_in_bsk, tl.int64) + stride_bsn = tl.cast(stride_in_bsn, tl.int64) + + if NUM_KSPLIT == 1: + remap_xcd(pid, GRID_MN) + + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) + else: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + tl.assume(pid_batch >= 0) + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + # We assume 32 elements along K share the same scale. + SCALE_GROUP_SIZE: tl.constexpr = 32 + + if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: + + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) + + # Create pointers for first block of A and B input matrices + # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container. 
+ offs_k = tl.arange(0, BLOCK_SIZE_K // 2) + offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + ( + pid_batch * stride_ab + + offs_am[:, None] * stride_am + + offs_k_split[None, :] * stride_ak + ) + b_ptrs = b_ptr + ( + pid_batch * stride_bb + + offs_k_split[:, None] * stride_bk + + offs_bn[None, :] * stride_bn + ) + # Create pointers for the first block of A and B scales + offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)) + tl.arange( + 0, BLOCK_SIZE_K // SCALE_GROUP_SIZE + ) + a_scale_ptrs = ( + a_scales_ptr + + pid_batch * stride_asb + + offs_am[:, None] * stride_asm + + offs_ks[None, :] * stride_ask + ) + # B scales are N x K even though B operand is K x N. + b_scale_ptrs = ( + b_scales_ptr + + pid_batch * stride_bsb + + offs_bn[:, None] * stride_bsn + + offs_ks[None, :] * stride_bsk + ) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): + a_scales = tl.load(a_scale_ptrs) + b_scales = tl.load(b_scale_ptrs) + # a_scales = tl.full((BLOCK_SIZE_M, BLOCK_SIZE_K//SCALE_GROUP_SIZE), 127, dtype=tl.uint8) + # b_scales = tl.full((BLOCK_SIZE_N, BLOCK_SIZE_K//SCALE_GROUP_SIZE), 127, dtype=tl.uint8) + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs, cache_modifier=cache_modifier) + else: + a = tl.load( + a_ptrs, mask=offs_k[None, :] < K - k * (BLOCK_SIZE_K // 2), other=0 + ) + b = tl.load( + b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0 + ) + + accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1") + + # Advance the ptrs to the next K block. 
@triton.jit
def _batched_gemm_afp4_wfp4_reduce_kernel(
    c_in_ptr,
    c_out_ptr,
    M,
    N,
    stride_c_in_b,
    stride_c_in_k,
    stride_c_in_m,
    stride_c_in_n,
    stride_c_out_b,
    stride_c_out_m,
    stride_c_out_n,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    ACTUAL_KSPLIT: tl.constexpr,
    MAX_KSPLIT: tl.constexpr,
):
    """Reduce the split-K partial results of one output tile.

    Each program handles one (batch, M-tile, N-tile) triple. It loads a
    (MAX_KSPLIT, BLOCK_SIZE_M, BLOCK_SIZE_N) stack of partials from
    ``c_in_ptr``, sums it along the split axis, casts the result to the
    element type of ``c_out_ptr`` and stores the tile.

    Only the first ACTUAL_KSPLIT slices of the split axis are read; the
    remaining slices are masked out and contribute zero to the sum.
    """
    pid_batch = tl.program_id(axis=0)
    pid_m = tl.program_id(axis=1)
    pid_n = tl.program_id(axis=2)

    # Indices wrap modulo M / N, so lanes past the matrix edge address an
    # in-range element; that is why the load/store below carry no M/N mask.
    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, MAX_KSPLIT)
    c_in_ptrs = (
        c_in_ptr
        + pid_batch * stride_c_in_b
        + (offs_k[:, None, None] * stride_c_in_k)
        + (offs_m[None, :, None] * stride_c_in_m)
        + (offs_n[None, None, :] * stride_c_in_n)
    )

    if ACTUAL_KSPLIT == MAX_KSPLIT:
        c = tl.load(c_in_ptrs)
    else:
        # Masked-out lanes must be given an explicit value: without `other`
        # their contents are undefined and would be folded into the sum.
        c = tl.load(
            c_in_ptrs, mask=offs_k[:, None, None] < ACTUAL_KSPLIT, other=0.0
        )
    c = tl.sum(c, axis=0)

    c = c.to(c_out_ptr.type.element_ty)

    c_out_ptrs = (
        c_out_ptr
        + pid_batch * stride_c_out_b
        + (offs_m[:, None] * stride_c_out_m)
        + (offs_n[None, :] * stride_c_out_n)
    )

    tl.store(c_out_ptrs, c)
@functools.lru_cache(maxsize=1024)
def _get_config(
    M: int,
    N: int,
    K: int,
):
    """Pick the launch configuration for a batched AFP4/WFP4 GEMM of size (M, N, K).

    The JSON config tables are loaded lazily and kept as an attribute on this
    function: the device-wide default on first use, plus an optional table
    specific to (N, 2 * K) when such a file exists. The entry is chosen by the
    size of M, copied, and then completed with the split-K fields
    (SPLITK_BLOCK_SIZE and, when splitting, adjusted BLOCK_SIZE_K / NUM_KSPLIT).
    """
    if not hasattr(_get_config, "_config_dict"):
        dev = arch_info.get_device()
        _get_config._config_dict = {}
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-BATCHED_GEMM-AFP4WFP4.json"
        with open(fpath, "r") as file:
            _get_config._config_dict["default"] = json.load(file)

    tables = _get_config._config_dict

    key = f"{N}_{K}"
    if key not in tables:
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-BATCHED_GEMM-AFP4WFP4-N={N}-K={2*K}.json"
        if os.path.exists(fpath):
            with open(fpath, "r") as file:
                tables[key] = json.load(file)
        else:
            key = "default"  # fall back to default config

    # Map M onto the named size bucket used by the JSON tables.
    if M < 32:
        bucket = "small"
    elif M <= 128:
        medium_buckets = {32: "medium_M32", 64: "medium_M64", 128: "medium_M128"}
        bucket = medium_buckets[triton.next_power_of_2(M)]
    elif M <= 256:
        bucket = "large"
    else:
        bucket = "xlarge"

    # Work on a copy so the loaded table itself is never modified.
    config = tables[key][bucket].copy()

    if config["NUM_KSPLIT"] > 1:
        (
            config["SPLITK_BLOCK_SIZE"],
            config["BLOCK_SIZE_K"],
            config["NUM_KSPLIT"],
        ) = get_splitk(K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"])
    else:
        config["SPLITK_BLOCK_SIZE"] = 2 * K

    # A K block at least as large as the whole (unpacked) K extent is clamped.
    if config["BLOCK_SIZE_K"] >= 2 * K:
        config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K)
        config["SPLITK_BLOCK_SIZE"] = 2 * K

    return config
+ """ + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + w = w.transpose(1, 2) + Bx, M, K = x.shape + Bw, K, N = w.shape + By, _, _ = y.shape + assert Bx == Bw == By + Batch = Bx + + if config is None: + config = _get_config(M, N, K) + + if config["NUM_KSPLIT"] > 1: + if _USE_GEMM_SPLITK_BF16: + y_pp = torch.empty( + (Batch, config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device + ) + else: + y_pp = torch.empty( + (Batch, config["NUM_KSPLIT"], M, N), + dtype=torch.float32, + device=y.device, + ) + else: + y_pp = None + + grid = lambda META: ( # noqa: E731 + Batch, + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), + ) + _batched_gemm_afp4_wfp4_kernel[grid]( + x, + w, + y if config["NUM_KSPLIT"] == 1 else y_pp, + x_scales, + w_scales, + M, + N, + K, + x.stride(0), + x.stride(1), + x.stride(2), + w.stride(0), + w.stride(1), + w.stride(2), + y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), + 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(1), + y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), + y.stride(2) if config["NUM_KSPLIT"] == 1 else y_pp.stride(3), + x_scales.stride(0), + x_scales.stride(1), + x_scales.stride(2), + w_scales.stride(0), + w_scales.stride(1), + w_scales.stride(2), + **config, + ) + + if config["NUM_KSPLIT"] > 1: + REDUCE_BLOCK_SIZE_M = 16 + # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails + # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and + # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials + REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 + ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) + + grid_reduce = ( + Batch, + triton.cdiv(M, REDUCE_BLOCK_SIZE_M), + triton.cdiv(N, REDUCE_BLOCK_SIZE_N), + ) + _batched_gemm_afp4_wfp4_reduce_kernel[grid_reduce]( + y_pp, + y, + M, + N, + y_pp.stride(0), + y_pp.stride(1), + y_pp.stride(2), + y_pp.stride(3), 
@triton.heuristics(
    {
        "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0)
        and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0)
        and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0),
        "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"])
        * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]),
    }
)
@triton.jit
def _batched_gemm_afp4_wfp4_pre_quant_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    b_scales_ptr,
    M,
    N,
    K,
    stride_ab,
    stride_am,
    stride_ak,
    stride_bb,
    stride_bk,
    stride_bn,
    stride_cb,
    stride_ck,
    stride_cm,
    stride_cn,
    stride_bsb,
    stride_bsn,
    stride_bsk,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    NUM_KSPLIT: tl.constexpr,
    SPLITK_BLOCK_SIZE: tl.constexpr,
    EVEN_K: tl.constexpr,
    GRID_MN: tl.constexpr,
    cache_modifier: tl.constexpr,
):
    """Kernel for computing the batched matmul C = A x B with in-kernel A quantization.

    A is loaded unquantized, BLOCK_SIZE_K elements per K step, and converted to
    fp4 values plus scales by `_mxfp4_quant_op` before each `tl.dot_scaled`.
    B is read in packed form (two fp4 values per byte, BLOCK_SIZE_K // 2 bytes
    per K step) together with its scales, one per 32 elements along K.

    K is the packed extent of B along the reduction axis, so A spans 2 * K
    elements along that axis. Per batch, A is (M, 2 * K), packed B is
    (K, N) and C is (M, N). With NUM_KSPLIT > 1 each program writes a partial
    result into the `pid_k` slice of C (addressed through `stride_ck`).
    """

    tl.assume(stride_ab > 0)
    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bb > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_cb > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)
    tl.assume(stride_bsb > 0)
    tl.assume(stride_bsk > 0)
    tl.assume(stride_bsn > 0)

    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid_batch = tl.program_id(axis=0)
    pid_unified = tl.program_id(axis=1)
    pid_k = pid_unified % NUM_KSPLIT
    pid = pid_unified // NUM_KSPLIT
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    # Cast batch id and batch dimension strides to int64 to avoid int32 overflow during offset calculation
    # Note: If you're attempting to cast strides to int64 to prevent integer overflow, use `tl.cast` instead of `.to()`.
    # See https://github.com/ROCm/aiter/pull/597 for rationale
    stride_ab = tl.cast(stride_ab, tl.int64)
    stride_bb = tl.cast(stride_bb, tl.int64)
    stride_cb = tl.cast(stride_cb, tl.int64)
    pid_batch = tl.cast(pid_batch, tl.int64)

    if NUM_KSPLIT == 1:
        # NOTE(review): the return value of remap_xcd is discarded here;
        # confirm whether the remapped pid was meant to be assigned back.
        remap_xcd(pid, GRID_MN)

        pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M)
    else:
        pid_m = pid // num_pid_n
        pid_n = pid % num_pid_n

    tl.assume(pid_batch >= 0)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
    # We assume 32 elements along K share the same scale.
    SCALE_GROUP_SIZE: tl.constexpr = 32

    # Programs whose K split starts past the end of K have nothing to do.
    if (pid_k * SPLITK_BLOCK_SIZE // 2) < K:

        num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2)

        # Create pointers for first block of A and B input matrices
        # A is addressed per element (BLOCK_SIZE_K wide)...
        offs_k_bf16 = tl.arange(0, BLOCK_SIZE_K)
        offs_k_split_bf16 = pid_k * SPLITK_BLOCK_SIZE + offs_k_bf16
        offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
        a_ptrs = a_ptr + (
            pid_batch * stride_ab
            + offs_am[:, None] * stride_am
            + offs_k_split_bf16[None, :] * stride_ak
        )

        # ...while B is packed: the BLOCK sizes are of the elements and in
        # fp4 we pack 2 per uint8 container.
        offs_k = tl.arange(0, BLOCK_SIZE_K // 2)
        offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k
        offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
        b_ptrs = b_ptr + (
            pid_batch * stride_bb
            + offs_k_split[:, None] * stride_bk
            + offs_bn[None, :] * stride_bn
        )
        # Create pointers for the first block of B scales
        offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)) + tl.arange(
            0, BLOCK_SIZE_K // SCALE_GROUP_SIZE
        )
        # B scales are N x K even though B operand is K x N.
        b_scale_ptrs = (
            b_scales_ptr
            + pid_batch * stride_bsb
            + offs_bn[:, None] * stride_bsn
            + offs_ks[None, :] * stride_bsk
        )

        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

        for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter):
            b_scales = tl.load(b_scale_ptrs)
            # Load the next block of A and B, generate a mask by checking the K dimension.
            # If it is out of bounds, set it to 0.
            if EVEN_K:
                a_bf16 = tl.load(a_ptrs)
                b = tl.load(b_ptrs, cache_modifier=cache_modifier)
            else:
                # The A tile is BLOCK_SIZE_K wide and A has 2 * K elements
                # along the reduction axis, so the mask is built from the
                # unpacked offsets against the unpacked extent.
                a_bf16 = tl.load(
                    a_ptrs,
                    mask=offs_k_bf16[None, :] < 2 * K - k * BLOCK_SIZE_K,
                    other=0,
                )
                b = tl.load(
                    b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0
                )

            a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32)

            accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1")

            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak
            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
            b_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_bsk

        c = accumulator.to(c_ptr.type.element_ty)

        # Write back the block of the output matrix C with masks.
        offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
        offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
        c_ptrs = (
            c_ptr
            + pid_batch * stride_cb
            + stride_cm * offs_cm[:, None]
            + stride_cn * offs_cn[None, :]
            + pid_k * stride_ck
        )
        c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
        tl.store(c_ptrs, c, mask=c_mask)
def get_splitk(K: int, BLOCK_SIZE_K: int, NUM_KSPLIT: int):
    """Adjust the split-K parameters so that K divides evenly where possible.

    Starting from the requested NUM_KSPLIT and BLOCK_SIZE_K, the split count
    (and, failing that, the K block size) is halved until K is a multiple of
    both half the per-split span and half the K block, or until either
    NUM_KSPLIT reaches 1 or BLOCK_SIZE_K reaches 16 or less.

    Returns:
        Tuple ``(SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT)``.
    """

    def _span(num_split, block_k):
        # 2 * ceil(K / num_split), rounded up to a whole number of K blocks.
        per_split = (K + num_split - 1) // num_split
        return ((2 * per_split + block_k - 1) // block_k) * block_k

    splits = NUM_KSPLIT
    block_k = BLOCK_SIZE_K
    span = _span(splits, block_k)

    while splits > 1 and block_k > 16:
        split_even = K % (span // 2) == 0
        span_aligned = span % block_k == 0
        block_even = K % (block_k // 2) == 0

        if split_even and span_aligned and block_even:
            break

        if not (split_even and span_aligned):
            # Uneven split: try half as many splits.
            splits //= 2
        else:
            # Splits are even but the K block is not: shrink the block.
            block_k //= 2

        span = _span(splits, block_k)

    return span, block_k, splits
def batched_gemm_afp4wfp4_pre_quant(
    x,
    w,
    w_scales,
    dtype: Optional[torch.dtype] = torch.bfloat16,
    y: Optional[torch.Tensor] = None,
    config: Optional[dict] = None,
):
    """
    Computes the batched matmul Y = X x W.
    W is an e2m1 fp4 tensor and w_scales is an e8m0 tensor.
    Every 32 elements in the K dimension share one e8m0 scale.
    X gets quantized to the microscale fp4 (mxfp4) format inside the kernel.

    Key parameters:
    - X: Tensor with shape (B, M, *); its last dimension is not used for
      sizing (K is taken from W).
    - W: Tensor with shape (B, N, K); it is transposed to (B, K, N) as a view
      before the kernel launch.
    - W_scales: Tensor with shape (B, N, K // 32)
    - dtype: Element type of the output when `y` is not supplied.
    - y: Optional preallocated output with shape (B, M, N). Allocated on X's
      device when omitted.
    - config: Optional launch configuration. It is copied before the split-K
      fields are filled in, so the caller's dict is left untouched.

    Returns:
    - Y: The output tensor with shape (B, M, N).
    """

    Bx, M, _ = x.shape
    Bw, N, K = w.shape
    assert Bx == Bw, "Incompatible Batch dimensions!!!"
    Batch = Bx

    if y is None:
        y = torch.empty((Batch, M, N), dtype=dtype, device=x.device)
    else:
        assert y.shape[0] == Batch, "Incompatible Batch dimensions!!!"

    w = w.transpose(1, 2)

    if config is None:
        config = _get_config(M, N, K)
    # The fields written below depend on K. Work on a private copy so neither
    # the cached table entry returned by _get_config nor a caller-supplied
    # dict is modified (which would leak one problem's settings into the next).
    config = dict(config)

    if config["NUM_KSPLIT"] > 1:
        SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk(
            K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"]
        )

        config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE
        config["BLOCK_SIZE_K"] = BLOCK_SIZE_K
        config["NUM_KSPLIT"] = NUM_KSPLIT

        # Scratch buffer holding one partial result per K split.
        if _USE_GEMM_SPLITK_BF16:
            y_pp = torch.empty(
                (Batch, config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device
            )
        else:
            y_pp = torch.empty(
                (Batch, config["NUM_KSPLIT"], M, N),
                dtype=torch.float32,
                device=y.device,
            )
    else:
        config["SPLITK_BLOCK_SIZE"] = 2 * K
        y_pp = None

    if config["BLOCK_SIZE_K"] >= 2 * K:
        config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K)
        config["SPLITK_BLOCK_SIZE"] = 2 * K

    # get_splitk may have reduced the split count to 1, so decide the output
    # target from the final value.
    split_k = config["NUM_KSPLIT"] > 1
    out = y_pp if split_k else y

    grid = lambda META: (  # noqa: E731
        Batch,
        (
            META["NUM_KSPLIT"]
            * triton.cdiv(M, META["BLOCK_SIZE_M"])
            * triton.cdiv(N, META["BLOCK_SIZE_N"])
        ),
    )
    _batched_gemm_afp4_wfp4_pre_quant_kernel[grid](
        x,
        w,
        out,
        w_scales,
        M,
        N,
        K,
        x.stride(0),
        x.stride(1),
        x.stride(2),
        w.stride(0),
        w.stride(1),
        w.stride(2),
        out.stride(0),
        y_pp.stride(1) if split_k else 0,
        y_pp.stride(2) if split_k else y.stride(1),
        y_pp.stride(3) if split_k else y.stride(2),
        w_scales.stride(0),
        w_scales.stride(1),
        w_scales.stride(2),
        **config,
    )

    if split_k:
        REDUCE_BLOCK_SIZE_M = 16
        # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails
        # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and
        # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials
        REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64
        # Number of splits that actually cover part of K; trailing splits
        # beyond this are skipped by the GEMM kernel and masked in the reduce.
        ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2))

        grid_reduce = (
            Batch,
            triton.cdiv(M, REDUCE_BLOCK_SIZE_M),
            triton.cdiv(N, REDUCE_BLOCK_SIZE_N),
        )
        _batched_gemm_afp4_wfp4_pre_quant_reduce_kernel[grid_reduce](
            y_pp,
            y,
            M,
            N,
            y_pp.stride(0),
            y_pp.stride(1),
            y_pp.stride(2),
            y_pp.stride(3),
            y.stride(0),
            y.stride(1),
            y.stride(2),
            REDUCE_BLOCK_SIZE_M,
            REDUCE_BLOCK_SIZE_N,
            ACTUAL_KSPLIT,
            config["NUM_KSPLIT"],
        )
    return y
+ stride_ab, + stride_am, + stride_ak, + stride_bb, + stride_bk, + stride_bn, + stride_cb, + stride_cm, + stride_cn, + stride_biasb, + # Meta-parameters + HAS_BIAS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + EVEN_K: tl.constexpr, + GRID_MN: tl.constexpr, +): + """ + Note: this is Triton jited function and not meant to be called directly. Call batched_gemm_bf16 function + below + + Computes the matmul C[i] = A[i] x B[i] for every i in a given batch and optionally adds a bias to each result. + + Key parameters: + - A: Batch tensor A with shape (B, M, K). + - B: Batch tensor B with shape (B, K, N). + - C: Batch tensor C with shape (B, M, N). + - Bias: Bias batch tensor with shape (B, 1, N). + """ + + tl.assume(stride_ab > 0) + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bb > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cb > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_biasb > 0) + + # ----------------------------------------------------------- + # Get batch program id + batch_id = tl.program_id(axis=0) + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. 
+ pid = tl.program_id(axis=1) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + if GROUP_SIZE_M == 1: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + + # Cast batch id and batch dimension strides to int64 to avoid int32 overflow during offset calculation + # Note: If you're attempting to cast strides to int64 to prevent integer overflow, use `tl.cast` instead of `.to()`. + # See https://github.com/ROCm/aiter/pull/597 for rationale + batch_id = tl.cast(batch_id, tl.int64) + stride_ab = tl.cast(stride_ab, tl.int64) + stride_bb = tl.cast(stride_bb, tl.int64) + stride_cb = tl.cast(stride_cb, tl.int64) + + # Create pointers for first block of A and B input matrices + offs_k = tl.arange(0, BLOCK_SIZE_K) + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + ( + batch_id * stride_ab + + offs_am[:, None] * stride_am + + offs_k[None, :] * stride_ak + ) + b_ptrs = b_ptr + ( + batch_id * stride_bb + + offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn + ) + + acc_dtype = tl.float32 if c_ptr.type.element_ty != tl.int8 else tl.int32 + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. 
def batched_gemm_bf16(
    XQ: torch.Tensor,
    WQ: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    dtype: Optional[torch.dtype] = torch.bfloat16,
    splitK: Optional[int] = None,
    YQ: Optional[torch.Tensor] = None,
    config: Optional[dict] = None,
):
    """
    Computes the matmul YQ[i] = XQ[i] x WQ[i]T for every i in a given batch and optionally adds a bias to each result.

    Key parameters:
    - XQ: Batch tensor XQ with shape (B, M, K).
    - WQ: Batch tensor WQ with shape (B, N, K).
    - Bias: Bias batch tensor with shape (B, 1, N).
    - dtype: Output element type used when YQ is not supplied (bfloat16 or float16).
    - splitK: Must be None; split-K is not supported here.
    - YQ: Output Matrix Y with shape (B, M, N). If this is none, then it's created by this API and returned as output
    - config: Optional kernel launch configuration; looked up from M, N, K when omitted.

    Returns:
    - YQ: The output batch tensor with shape (B, M, N).
    """

    # Make sure XQ and WQ are contiguous in memory
    XQ = XQ.contiguous()
    WQ = WQ.contiguous()

    # Check constraints.
    assert XQ.shape[0] == WQ.shape[0], "Incompatible Batch dimensions!!!"
    assert XQ.shape[2] == WQ.shape[2], "Incompatible K dimensions!!!"
    assert dtype in [
        torch.bfloat16,
        torch.float16,
    ], f"Output {dtype=} is currently not supported in batched_gemm_bf16"
    assert splitK is None, "Currently, there isn't any support for splitK on Triton"

    # Transpose N and K dimensions of WQ: (B, N, K) -> (B, K, N)
    WQ = WQ.transpose(1, 2)

    B = XQ.shape[0]
    M = XQ.shape[1]
    K = XQ.shape[2]
    N = WQ.shape[2]

    has_bias = bias is not None
    if has_bias:
        # The kernel addresses bias as `batch * stride(0) + n`, i.e. with an
        # implicit unit stride along N, so hand it a contiguous tensor.
        bias = bias.contiguous()

    if YQ is None:
        YQ = torch.empty((B, M, N), dtype=dtype, device=XQ.device)

    if config is None:
        config = _get_config(M, N, K)

    grid = lambda META: (  # noqa: E731
        B,
        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
    )

    _batched_gemm_bf16_kernel[grid](
        XQ,
        WQ,
        YQ,
        bias,
        M,
        N,
        K,
        XQ.stride(0),
        XQ.stride(1),
        XQ.stride(2),
        WQ.stride(0),
        WQ.stride(1),
        WQ.stride(2),
        YQ.stride(0),
        YQ.stride(1),
        YQ.stride(2),
        bias.stride(0) if has_bias else 0,
        has_bias,
        **config,
    )

    return YQ
@triton.jit
def cdiv_fn(x, y):
    # Bias the numerator by (y - 1) so the floor division below rounds up
    # whenever x is not an exact multiple of y.
    biased = x + (y - 1)
    return biased // y
stride_k_cache_4: tl.constexpr, # int + stride_v_cache_0: tl.constexpr, # int + stride_v_cache_1: tl.constexpr, # int + stride_v_cache_2: tl.constexpr, # int + stride_v_cache_3: tl.constexpr, # int + SKIP_PREFILL: tl.constexpr, # bool + USE_MATRIX_LOAD: tl.constexpr, + query_start_len_ptr, # [num_seqs+1] + HEAD_DIM_PAD_REQ: tl.constexpr, # bool +): + seq_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + + tl.static_assert(CACHE_BLOCK_SIZE % BLOCK_SIZE == 0, "CACHE_BLOCK_SIZE must be divisible by BLOCK_SIZE") + + if SKIP_PREFILL: + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + + 1) + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index + if cur_batch_query_len > 1: + return + else: + cur_batch_in_all_start_index = seq_idx + + query_head_idx = kv_head_idx * num_queries_per_kv + tl.arange( + 0, num_queries_per_kv_padded) + head_mask = query_head_idx < (kv_head_idx + 1) * num_queries_per_kv + head_mask = head_mask & (query_head_idx < num_query_heads) + + query_offset = (cur_batch_in_all_start_index * query_stride_0 + + query_head_idx[:, None] * query_stride_1) + # Q : (num_queries_per_kv, HEAD_SIZE,) + if HEAD_DIM_PAD_REQ: + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, + 0).to(tl.int1) + Q = tl.load( + query_ptr + query_offset + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + mask=dim_mask[None, :] & head_mask[:, None], + other=0.0, + ) + else: + Q = tl.load( + query_ptr + query_offset + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + mask=head_mask[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) + L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32) + acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], + dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = 
tl.load(seq_lens_ptr + seq_idx) + seq_len = annotate_hint(seq_len, "non-negative") + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_head_idx, + mask=head_mask, + other=0.0) + + # iterate through tiles + for start_n in range(0, seq_len, BLOCK_SIZE): + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + (start_n // CACHE_BLOCK_SIZE)) + physical_block_idx = annotate_hint(physical_block_idx, "non-negative") + offs_n = tl.arange(0, BLOCK_SIZE) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + + if BLOCK_SIZE == CACHE_BLOCK_SIZE: + if USE_MATRIX_LOAD: + v_offset = (physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_1) + else: + v_offset = (physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_1 + + offs_d[None, :] * stride_v_cache_2 + + offs_n[:, None] * stride_v_cache_3) + + k_offset = (physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_1 + + (offs_d[:, None] // x) * stride_k_cache_2 + + offs_n[None, :] * stride_k_cache_3 + + (offs_d[:, None] % x) * stride_k_cache_4) + else: + if USE_MATRIX_LOAD: + v_offset = (physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_1) + else: + v_offset = (physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_1 + + offs_d[None, :] * stride_v_cache_2 + + ((start_n + offs_n[:, None]) % CACHE_BLOCK_SIZE) * stride_v_cache_3) + + k_offset = (physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_1 + + (offs_d[:, None] // x) * stride_k_cache_2 + + ((start_n + offs_n[None, :]) % CACHE_BLOCK_SIZE) * stride_k_cache_3 + + (offs_d[:, None] % x) * stride_k_cache_4) + # K : (HEAD_SIZE, BLOCK_SIZE) + if HEAD_DIM_PAD_REQ: + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None], + other=0.0) + else: + K_load = tl.load(key_cache_ptr + k_offset) + + if K_load.dtype.is_fp8(): + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : 
(BLOCK_SIZE, HEAD_SIZE) + if USE_MATRIX_LOAD: + if HEAD_DIM_PAD_REQ: + V_load = tl.matrix_load( + value_cache_ptr + v_offset, + shape=[CACHE_BLOCK_SIZE, HEAD_SIZE], + strides=[stride_v_cache_3, stride_v_cache_2], + block_shape=[BLOCK_SIZE, HEAD_SIZE_PADDED], + offsets=[(start_n % CACHE_BLOCK_SIZE).to(tl.int32), 0], + boundary_check=(1,)) + else: + V_load = tl.matrix_load( + value_cache_ptr + v_offset, + shape=[CACHE_BLOCK_SIZE, HEAD_SIZE], + strides=[stride_v_cache_3, stride_v_cache_2], + block_shape=[BLOCK_SIZE, HEAD_SIZE_PADDED], + offsets=[(start_n % CACHE_BLOCK_SIZE).to(tl.int32), 0]) + else: + if HEAD_DIM_PAD_REQ: + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :], + other=0.0) + else: + V_load = tl.load(value_cache_ptr + v_offset) + + if V_load.dtype.is_fp8(): + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_offset = start_n + tl.arange(0, BLOCK_SIZE) + boundary = tl.full([BLOCK_SIZE], seq_len, dtype=tl.int32) + seq_mask = seq_offset < boundary + # S : (num_queries_per_kv, BLOCK_SIZE,) + S = tl.where(head_mask[:, None] & seq_mask[None, :], 0.0, + float("-inf")).to(tl.float32) + S += scale * tl.dot(Q, K) + + context_len = seq_len - 1 + + if SLIDING_WINDOW > 0: + S = tl.where((context_len - seq_offset) < SLIDING_WINDOW, S, + -10000) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (num_queries_per_kv,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # P : (num_queries_per_kv, BLOCK_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (num_queries_per_kv,) + l_j = tl.sum(P, axis=1) + + # alpha : (num_queries_per_kv, ) + alpha = tl.exp(M - m_j) + + # acc : (num_queries_per_kv, BLOCK_SIZE,) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (num_queries_per_kv, BLOCK_SIZE,) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + + output_offset = 
(cur_batch_in_all_start_index * output_stride_0 + + query_head_idx * output_stride_1) + + if HEAD_DIM_PAD_REQ: + tl.store( + output_ptr + output_offset[:, None] + + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + acc, + mask=dim_mask[None, :] & head_mask[:, None], + ) + else: + tl.store( + output_ptr + output_offset[:, None] + + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + acc, + mask=head_mask[:, None], + ) + +@functools.lru_cache +def find_block(a, b): + if a < 16 or b < 16: + return None + # 找到小于等于b的最大2的幂 + max_power = (b).bit_length() - 1 + # 从大到小检查2的幂 + for k in range(max_power, 3, -1): + power = 1 << k + if a % power == 0: + return power + return None + +@functools.lru_cache +def get_paged_attention_2d_config_filepath(cache_block_size, head_size, slide_window, + use_alibi_slopes, filter_by_query_len, + kv_dtype, **kwargs) -> str: + kv_type = "auto" + if kv_dtype == torch.float8_e4m3fn or kv_dtype == torch.float8_e5m2: + kv_type = "fp8" + device_name = arch_info.get_arch() + head_size_padded = triton.next_power_of_2(head_size) + head_size_pad_need = head_size != head_size_padded + json_file_name = ( + f"paged_attention_2d-device={device_name}" + f"-CACHE_BLOCK_SIZE={cache_block_size}" + f"-HEAD_SIZE_PADDED={head_size_padded}" + f"-SLIDING_WINDOW={slide_window}" + f"-USE_ALIBI_SLOPES={use_alibi_slopes}" + f"-HEAD_DIM_PAD_REQ={head_size_pad_need}" + f"-kv_dtype={kv_type}.json" + ) + + config_file_path = os.path.join( + f"{AITER_TRITON_CONFIGS_PATH}", "paged_attention_2d", json_file_name + ) + return config_file_path + +@functools.lru_cache +def get_paged_attention_2d_config( + cache_block_size, + head_size, + num_querys_per_kv, + slide_window, + use_alibi_slopes, + filter_by_query_len, + kv_dtype +) -> Optional[Dict]: + config_file_path = get_paged_attention_2d_config_filepath(cache_block_size, head_size, + slide_window, use_alibi_slopes, + filter_by_query_len, kv_dtype) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + configs = {int(key): 
val for key, val in json.load(f)["config"].items()} + if configs: + num_querys_per_kv_padded = triton.next_power_of_2(num_querys_per_kv) + config = configs[min(configs.keys(), key=lambda x: abs(x - num_querys_per_kv_padded))] + # logger.info(f"paged_attention_2d use kernel config from:{config_file_path}") + return config + # If no optimized configuration is available, we will use the default + logger.warning( + f"\nUsing default paged_attention_2d kernel config. Performance might " + f"be sub-optimal! Config not found at {config_file_path}") + return None + +@torch.inference_mode() +def paged_attention_2d( + query, + output, + kv_cache_dtype, + key_cache, + value_cache, + block_table, + query_start_loc, + seq_lens, + k_scale, + v_scale, + alibi_slopes=None, + sliding_window=None, + sm_scale=None, + filter_by_query_len=True, +): + if sm_scale is None: + sm_scale = 1.0 / (query.shape[1]**0.5) + use_alibi_slopes = alibi_slopes is not None + + if sliding_window is None or sliding_window <= 0: + sliding_window = 0 + + cache_block_size = value_cache.shape[3] + head_size = query.shape[2] + head_size_padded = triton.next_power_of_2(head_size) + + num_seqs = len(seq_lens) + num_query_heads = query.shape[1] + num_kv_heads = key_cache.shape[1] + num_queries_per_kv = query.shape[1] // key_cache.shape[1] + num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) + + if "fp8" in kv_cache_dtype and (key_cache.dtype == torch.uint8 or value_cache.dtype == torch.uint8): + # kv_cache may view as uint8 + if kv_cache_dtype in ("fp8", "fp8e4m3"): + target_dtype = torch.float8_e4m3fn + elif kv_cache_dtype == "fp8e5m2": + target_dtype = torch.float8_e5m2 + else: + raise ValueError("Unsupported FP8 dtype:", kv_cache_dtype) + key_cache = key_cache.view(target_dtype) + value_cache = value_cache.view(target_dtype) + + config = get_paged_attention_2d_config(cache_block_size, head_size, num_queries_per_kv, + sliding_window, use_alibi_slopes, filter_by_query_len, 
key_cache.dtype) + if not config: + config = {'num_warps': 4, 'num_stages': 1, 'USE_MATRIX_LOAD': False} + if 'BLOCK_SIZE' not in config: + cache_ele_size = key_cache.element_size() + block_size = cache_block_size + if block_size * head_size_padded * cache_ele_size > 16384: # 64 * 128 * 2 + block_size = (16384 // (head_size_padded * cache_ele_size)) + block_size = find_block(cache_block_size, block_size) + assert block_size != None, "can not find suitable block_size size for kernel_paged_attention_2d" + assert (cache_ele_size >= 2 or block_size >= 32), "Block size must be at least 32 for fp8" + config['BLOCK_SIZE'] = block_size + + assert cache_block_size % config['BLOCK_SIZE'] == 0, "cache_block_size % block_size need be 0." + # print(f"kernel_paged_attention_2d: {num_queries_per_kv=} {config=}") + + _kernel_paged_attention_2d[( + num_seqs, + num_kv_heads, + )]( + output_ptr=output, + query_ptr=query, + key_cache_ptr=key_cache, + value_cache_ptr=value_cache, + block_tables_ptr=block_table, + seq_lens_ptr=seq_lens, + alibi_slopes_ptr=alibi_slopes, + scale=sm_scale, + k_scale=k_scale, + v_scale=v_scale, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + num_queries_per_kv_padded=num_queries_per_kv_padded, + block_table_stride=block_table.stride(0), + query_stride_0=query.stride(0), + query_stride_1=query.stride(1), + output_stride_0=output.stride(0), + output_stride_1=output.stride(1), + CACHE_BLOCK_SIZE=cache_block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=head_size_padded, + USE_ALIBI_SLOPES=use_alibi_slopes, + SLIDING_WINDOW=sliding_window, + x=key_cache.shape[4], + stride_k_cache_0=key_cache.stride(0), + stride_k_cache_1=key_cache.stride(1), + stride_k_cache_2=key_cache.stride(2), + stride_k_cache_3=key_cache.stride(3), + stride_k_cache_4=key_cache.stride(4), + stride_v_cache_0=value_cache.stride(0), + stride_v_cache_1=value_cache.stride(1), + stride_v_cache_2=value_cache.stride(2), + stride_v_cache_3=value_cache.stride(3), + 
SKIP_PREFILL=filter_by_query_len, + query_start_len_ptr=query_start_loc, + HEAD_DIM_PAD_REQ=(head_size != head_size_padded), + **config, + ) + + +def chunked_prefill_paged_decode( + query, + key, + value, + output, + kv_cache_dtype, + key_cache, + value_cache, + block_table, + query_start_loc, + seq_lens, + max_query_len, + k_scale, + v_scale, + alibi_slopes=None, + sliding_window=None, + sm_scale=None, +): + if sm_scale is None: + sm_scale = 1.0 / (query.shape[1]**0.5) + + use_alibi_slopes = alibi_slopes is not None + + if sliding_window is None or sliding_window <= 0: + sliding_window = 0 + + if max_query_len > 1: + context_attention_fwd( + q=query, + k=key, + v=value, + o=output, + kv_cache_dtype=kv_cache_dtype, + k_cache=key_cache, + v_cache=value_cache, + b_loc=block_table, + b_start_loc=query_start_loc, + b_seq_len=seq_lens, + max_input_len=max_query_len, + k_scale=k_scale, + v_scale=v_scale, + alibi_slopes=alibi_slopes, + sliding_window=sliding_window, + sm_scale=sm_scale, + skip_decode=True, + ) + + paged_attention_2d( + query, + output, + kv_cache_dtype, + key_cache, + value_cache, + block_table, + query_start_loc, + seq_lens, + k_scale, + v_scale, + alibi_slopes=alibi_slopes, + sliding_window=sliding_window, + sm_scale=sm_scale, + filter_by_query_len=True, + ) diff --git a/aiter/ops/triton/configs/BW200-EXTEND_ATTENTION-FP16.json b/aiter/ops/triton/configs/BW200-EXTEND_ATTENTION-FP16.json new file mode 100644 index 0000000000000000000000000000000000000000..95f7e5af09671cc79e20b1112e7ce176acdb4b15 --- /dev/null +++ b/aiter/ops/triton/configs/BW200-EXTEND_ATTENTION-FP16.json @@ -0,0 +1,198 @@ +{ + "config": { + "(4, 48, 32, False, True)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(5, 30, 17, False, True)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": 
"mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(5, 17, 17, False, True)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(5, 19, 19, False, True)": { + "BLOCK_M": 64, + "BLOCK_N": 32, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(16, 128, 128, False, True)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(1, 192, 128, False, True)": { + "BLOCK_M": 16, + "BLOCK_N": 16, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "(16, 192, 128, False, True)": { + "BLOCK_M": 64, + "BLOCK_N": 32, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(16, 576, 512, False, True)": { + "BLOCK_M": 16, + "BLOCK_N": 16, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(4, 48, 32, False, False)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(5, 30, 17, False, False)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(5, 17, 17, False, False)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(5, 19, 19, False, False)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(16, 128, 128, False, False)": { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(1, 192, 128, False, False)": { + "BLOCK_M": 16, + "BLOCK_N": 16, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "(16, 192, 128, False, False)": { + "BLOCK_M": 64, + "BLOCK_N": 32, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "(16, 576, 512, False, False)": { + "BLOCK_M": 16, + "BLOCK_N": 16, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } + }, + "path": { + "(4, 48, 32, False, True)": "35VPVAGZ5TEHWYDL6RTHWN6NXDZHY3DCUGG475ILKUDOL2TETCBQ", + "(5, 30, 17, False, True)": "H3M224TOOZQCBXRBKAWMVPBZM6UJ5Z6Z4KDHCMMKDKMKLPGPQDLQ", + "(5, 17, 17, False, True)": "4NSFERLCVSHQ62SCOIWHUUFZUMLPWWGEC5XKHBC3C5GVZY3AUY2A", + "(5, 19, 19, False, True)": "XRNUN4G6GBQJ6K6V5IJFS2GSJQZ5ZWU3FITB6UB43B7EQSZBMZZA", + "(16, 128, 128, False, True)": "NDVOD6G7KJCHCZVV7JHIDIUMS6ZV5EBPDSXJMVVKRZWFCYK6JPMQ", + "(1, 192, 128, False, True)": "BWUNRCHFYCEBDCVFDNAS2SEQFKX7D4UGJDAFJHNNSH756VDSUSWQ", + "(16, 192, 128, False, True)": "OMR6FAAM57B4TDKKZTXYHY2JICENF4EGKNEW6R2CLSIZENM62RYQ", + "(16, 576, 512, False, True)": "RHHIVAK5FQ5BA26Q4BIMSNESSTUUWT3ES63FDZ3NBEUBDIF4HBOA", + "(4, 48, 32, False, False)": "QQ3CJEATF7VF53WV2NBICAMQJZ2XWLTHWQLTZEAKB55EBOWKQAVA", + "(5, 30, 17, False, False)": 
"PWUSNPBVMCQQQGT6PGWDFCLM6BMXSSRZHNKILQFBE4YDRVAQ4LYA", + "(5, 17, 17, False, False)": "MF3UHLVBEMGLSMKP2ERUWBTOVV3F4CAXQFYPEBBNXV7IJO7JZOOQ", + "(5, 19, 19, False, False)": "LPTL4V5E5HUA53Q4LRS2QCTTF2WMHB5DF52A42BODLLCD6S7OW3A", + "(16, 128, 128, False, False)": "66S5D3ATXUMQYSYJLHGEXGVKRRZFU6YSDNS33D5WYR5HPXCRKONA", + "(1, 192, 128, False, False)": "FJKN5JEJRSZ3XEILI756XIWDXKGSWCK4P372GAVQ5M55RLUDL3LA", + "(16, 192, 128, False, False)": "QR6QRRTTDCOIDKU67C7MDV44Y6EJBVUNAMHGVHJRTVGSCPIVDWKQ", + "(16, 576, 512, False, False)": "AXL3BOKE3LCNFFY4VFKAEJKMKV5IESOMF2CVA4RBRXKCR5DM7KMQ" + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/BW200-GROUPED_DECODE_ATTENTION-STAGE1-FP16.json b/aiter/ops/triton/configs/BW200-GROUPED_DECODE_ATTENTION-STAGE1-FP16.json new file mode 100644 index 0000000000000000000000000000000000000000..01fa2855bf56ae35e70d31c28a8a7d350e6e0f63 --- /dev/null +++ b/aiter/ops/triton/configs/BW200-GROUPED_DECODE_ATTENTION-STAGE1-FP16.json @@ -0,0 +1,18 @@ +{ + "config": { + "(16, 16, 576, 512)": { + "BLOCK_N": 16, + "BLOCK_H": 16, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } + }, + "path": { + "(16, 16, 576, 512)": "D43F2CZASM7U7Z5MYGGCGKPTK2NQGFIONHS77X4CHCBAYQNNPQAA" + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/BW200-GROUPED_DECODE_ATTENTION-STAGE2-FP16.json b/aiter/ops/triton/configs/BW200-GROUPED_DECODE_ATTENTION-STAGE2-FP16.json new file mode 100644 index 0000000000000000000000000000000000000000..3a25f42218f76412e22af263168db1886b104139 --- /dev/null +++ b/aiter/ops/triton/configs/BW200-GROUPED_DECODE_ATTENTION-STAGE2-FP16.json @@ -0,0 +1,16 @@ +{ + "config": { + "(16, 512)": { + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 3 + } + }, + "path": { + "(16, 512)": 
"JPMBEYJ32U34XLUV2LZ466BSCQQL4NEHCHFCODHZYM27PZKPCY7Q" + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/README b/aiter/ops/triton/configs/README new file mode 100644 index 0000000000000000000000000000000000000000..85970e2d1cea5b5dedb55c03562d093f3d439503 --- /dev/null +++ b/aiter/ops/triton/configs/README @@ -0,0 +1,12 @@ +This directory contains tuned configurations for different settings of the fused_moe kernel. +For different settings of +- E (number of experts) +- N (intermediate size) +- device_name (torch.cuda.get_device_name()) +the JSON file contains a mapping from M (batch size) to the chosen configuration. + +The example configurations provided are for the Mixtral model for TP2 on H100 +and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have +N = 7168 and for TP4 we have N = 3584. + +See `benchmark/kernels/benchmark_moe.py` on how to generate these config files. diff --git a/aiter/ops/triton/configs/gemm/MI300X-BATCHED_GEMM-A16W16.json b/aiter/ops/triton/configs/gemm/MI300X-BATCHED_GEMM-A16W16.json new file mode 100644 index 0000000000000000000000000000000000000000..947fe026c7fbcd17ad9eceadf5e8bf3412a817db --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI300X-BATCHED_GEMM-A16W16.json @@ -0,0 +1,24 @@ +{ + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + }, + "small" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + } +} + + \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI300X-BATCHED_GEMM-A8W8.json b/aiter/ops/triton/configs/gemm/MI300X-BATCHED_GEMM-A8W8.json new file mode 100644 index 0000000000000000000000000000000000000000..947fe026c7fbcd17ad9eceadf5e8bf3412a817db --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/MI300X-BATCHED_GEMM-A8W8.json @@ -0,0 +1,24 @@ +{ + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + }, + "small" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + } +} + + \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI300X-GEMM-A16W16-ATOMIC.json b/aiter/ops/triton/configs/gemm/MI300X-GEMM-A16W16-ATOMIC.json new file mode 100644 index 0000000000000000000000000000000000000000..ab6ffdb6f3118d9a1af2efb7f18ca16958c5823b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI300X-GEMM-A16W16-ATOMIC.json @@ -0,0 +1,15 @@ +{ + "any": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "NUM_KSPLIT":1, + "cache_modifier": null, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI300X-GEMM-A16W16.json b/aiter/ops/triton/configs/gemm/MI300X-GEMM-A16W16.json new file mode 100644 index 0000000000000000000000000000000000000000..08ba04a4700759b31026507cfce752eeb857b836 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI300X-GEMM-A16W16.json @@ -0,0 +1,14 @@ +{ + "any": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI300X-GEMM-A8W8.json b/aiter/ops/triton/configs/gemm/MI300X-GEMM-A8W8.json new file mode 100644 index 0000000000000000000000000000000000000000..2c731811f4570efcfcce11bd013500fc3787d057 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI300X-GEMM-A8W8.json @@ -0,0 +1,13 @@ +{ + "any": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI300X-GEMM_BLOCKSCALE-A8W8.json b/aiter/ops/triton/configs/gemm/MI300X-GEMM_BLOCKSCALE-A8W8.json new file mode 100644 index 0000000000000000000000000000000000000000..2c731811f4570efcfcce11bd013500fc3787d057 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI300X-GEMM_BLOCKSCALE-A8W8.json @@ -0,0 +1,13 @@ +{ + "any": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-A16W16.json b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-A16W16.json new file mode 100644 index 0000000000000000000000000000000000000000..947fe026c7fbcd17ad9eceadf5e8bf3412a817db --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-A16W16.json @@ -0,0 +1,24 @@ +{ + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + }, + "small" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + } +} + + \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-A8W8.json b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-A8W8.json new file mode 100644 index 0000000000000000000000000000000000000000..947fe026c7fbcd17ad9eceadf5e8bf3412a817db --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-A8W8.json @@ -0,0 +1,24 @@ +{ + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + }, + "small" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16 + } +} + + \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4-N=128-K=512.json b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4-N=128-K=512.json new file mode 100644 index 0000000000000000000000000000000000000000..0894676d4377422e5a246f12e6929a424e0c7c0c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4-N=128-K=512.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + 
"BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4-N=512-K=128.json b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4-N=512-K=128.json new file mode 100644 index 0000000000000000000000000000000000000000..75b343e1ffd9d0ce775a16609d25dc808cbb4a2a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4-N=512-K=128.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 
1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4.json b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4.json new file mode 100644 index 0000000000000000000000000000000000000000..060cd8ff448a37e39c57000b1f7dc9a8e8474ea3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM-AFP4WFP4.json @@ -0,0 +1,82 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32" : { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64" : { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} + + \ 
No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=128-K=512.json b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=128-K=512.json new file mode 100644 index 0000000000000000000000000000000000000000..6ef605d8716e514cd5ea2b5113737edde70ba204 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=128-K=512.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=512-K=128.json 
b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=512-K=128.json new file mode 100644 index 0000000000000000000000000000000000000000..18a2a71315c5562b70a3b9fbc179d66c64801483 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4-N=512-K=128.json @@ -0,0 +1,74 @@ +{ + "small": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4.json b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4.json new file mode 100644 index 
0000000000000000000000000000000000000000..060cd8ff448a37e39c57000b1f7dc9a8e8474ea3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-BATCHED_GEMM_PREQUANT-AFP4WFP4.json @@ -0,0 +1,82 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32" : { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64" : { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} + + \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-ATOMIC-N=256-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-ATOMIC-N=256-K=7168.json new file mode 100644 index 
0000000000000000000000000000000000000000..2d6c94b04ead01236c0f47dc5571d95384722496 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-ATOMIC-N=256-K=7168.json @@ -0,0 +1,80 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "NUM_KSPLIT": 14, + "cache_modifier": ".cg", + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "NUM_KSPLIT": 14, + "cache_modifier": ".cg", + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "NUM_KSPLIT": 14, + "cache_modifier": ".cg", + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "NUM_KSPLIT": 14, + "cache_modifier": ".cg", + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "large": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "NUM_KSPLIT": 14, + "cache_modifier": ".cg", + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "NUM_KSPLIT":1, + "cache_modifier": null, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 32, + "kpack": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-ATOMIC.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-ATOMIC.json new file mode 100644 index 
0000000000000000000000000000000000000000..7998ad7b7958855f3f5a87cf791d98fefd3c62df --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-ATOMIC.json @@ -0,0 +1,15 @@ +{ + "any": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "NUM_KSPLIT":1, + "cache_modifier": null, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 32, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json new file mode 100644 index 0000000000000000000000000000000000000000..a25e2cad395ae0046ce64ea7c6283c580e8fa950 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json @@ -0,0 +1,26 @@ +{ + "any": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + "kpack": 1 + }, + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "kpack": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16.json new file mode 100644 index 0000000000000000000000000000000000000000..a072abb2a4043188250fd388261c04fd79a8a4fe --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16.json @@ -0,0 +1,14 @@ +{ + "any": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A8W8.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A8W8.json new file mode 100644 
index 0000000000000000000000000000000000000000..7ec9abb7629657c4ac85717ac16f438e324f5b5f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A8W8.json @@ -0,0 +1,13 @@ +{ + "any": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A8WFP4.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A8WFP4.json new file mode 100644 index 0000000000000000000000000000000000000000..03f8117ec378bd5a92fbc34aace43d76a206e4b7 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A8WFP4.json @@ -0,0 +1,82 @@ +{ + "M_LT_32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "M_EQ_32" : { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "M_33_64" : { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "M_65_128" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "M_129_256" : { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 1, + 
"cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "default": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 32, + "kpack": 1, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} + + \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=106496-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=106496-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..0d14f1f091ce74a17ee72bf7adce4d21aae8d512 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=106496-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, 
+ "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=1280-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=1280-K=8192.json new file mode 100644 index 0000000000000000000000000000000000000000..5a78bc76792ba4143724cbc163015ee1ada586a6 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=1280-K=8192.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 16 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 2 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 2 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + 
"NUM_KSPLIT": 1 + } + +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=13312-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=13312-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..9924627c0392e45ddd3ea4e51a23e9572cbd66d5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=13312-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=13312.json 
b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=13312.json new file mode 100644 index 0000000000000000000000000000000000000000..8b9f77f0c27eb451a0e7394b2f11d50cb8d9cf5f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=13312.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=16384.json new file mode 100644 index 
0000000000000000000000000000000000000000..d03ad23027e50ffedee90cc06007521e1409a22f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=2048.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=2048.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ace9bcaaf62d940cefd6f825bbe78c06b5a258 --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=2048.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=26624.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=26624.json new file mode 100644 index 0000000000000000000000000000000000000000..aa1d4209a8cd2829e9ce06e6929fe358601cad39 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=26624.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=4096.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=4096.json new file mode 100644 index 0000000000000000000000000000000000000000..0c41e6f5ce5fdb55e561b9a37a7ed89a7851b4d4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=4096.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", 
+ "NUM_KSPLIT": 2 + }, + "medium_M32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=53248.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=53248.json new file mode 100644 index 0000000000000000000000000000000000000000..5ad122387211369b48038ddf1feda466362c7067 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=53248.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=6656.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=6656.json new file mode 100644 index 0000000000000000000000000000000000000000..7e843287a630bc38618ff466dadf199b9b1b42f0 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=6656.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=8192.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=8192.json new file mode 100644 index 0000000000000000000000000000000000000000..9980103e769cb13c7ab309c3f57106653010e6e1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=16384-K=8192.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + 
"cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=18432-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=18432-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..09bbcc147260ebc3b836af30cf5d505fb44122a5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=18432-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 
2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=2112-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=2112-K=7168.json new file mode 100644 index 0000000000000000000000000000000000000000..87836a2dea867712694dc01288be9600120fabf6 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=2112-K=7168.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=2304-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=2304-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..686d68fd462afbe69731f9d3fc7dcd55f5a346d9 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=2304-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 2, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 2, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 2, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + 
"matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=26624-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=26624-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..8d6467775f8b412f420ffebd2ac68c666af79745 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=26624-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 
256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=3072-K=1536.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=3072-K=1536.json new file mode 100644 index 0000000000000000000000000000000000000000..3efc1323292e2a2380e3f1374446ada8c84f1438 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=3072-K=1536.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} 
diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=4608-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=4608-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..6c55b84e4e290a19d993d5bb51a19e949937a6f9 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=4608-K=16384.json @@ -0,0 +1,74 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=512-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=512-K=7168.json new 
file mode 100644 index 0000000000000000000000000000000000000000..3a2e463c75524fc76546e3ea2b4e3e40c6e19c08 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=512-K=7168.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=53248-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=53248-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..cf1a57caf6179bf98990a7cdef01b17b3be9a3dc --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=53248-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 2 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=7168-K=2048.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=7168-K=2048.json new file mode 100644 index 0000000000000000000000000000000000000000..db5d1308f04ef6e3b53b07773e7feb7ea3b14dc8 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=7168-K=2048.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 1024, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 2, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=7168-K=256.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=7168-K=256.json new file mode 100644 index 0000000000000000000000000000000000000000..9f80e9437c9070570fcd8d4a1dee993f18e63a97 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=7168-K=256.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + 
"NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=9216-K=16384.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=9216-K=16384.json new file mode 100644 index 0000000000000000000000000000000000000000..c310d83a8e369683c55125dc8decf306a82d0e9a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4-N=9216-K=16384.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 
3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4.json new file mode 100644 index 0000000000000000000000000000000000000000..da2f6035b44411a5ca752b190b68a4b5bc7700d9 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-AFP4WFP4.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 16 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_stages": 3, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 2, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM_BLOCKSCALE-A8W8.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM_BLOCKSCALE-A8W8.json new file mode 100644 index 0000000000000000000000000000000000000000..5c24b1495a50ebbd6bdaa4a1b0fd652a7026f3bc --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM_BLOCKSCALE-A8W8.json @@ -0,0 +1,13 @@ +{ + "any": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM_PREQUANT-AFP4WFP4-N=512-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM_PREQUANT-AFP4WFP4-N=512-K=7168.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a37990bc7cb21b82afe7590b8ba8fb4fcfaa90 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM_PREQUANT-AFP4WFP4-N=512-K=7168.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "medium_M32": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "medium_M64": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "medium_M128": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "large": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "xlarge": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM_PREQUANT-AFP4WFP4.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM_PREQUANT-AFP4WFP4.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b3d19f7f590ddc06dd686730ccd270cfcdcb87 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM_PREQUANT-AFP4WFP4.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M128": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + } + +} diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..e5996c2ed8e8ab8d3c318b638b10b0365e518e0e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2601 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 1536 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 1536 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 1536 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 1536 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 1536 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 1536 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 1536 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 1536 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 1536 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 1536 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 1536 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 1536 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 1536 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 1536 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 1536 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 1536 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 1536 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 1536 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 1536 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 1536 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 1536 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 32, + 1536 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 1536 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 1536 + ] + }, + "36": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 1536 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 1536 + ] + }, + "38": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 1536 + ], + "D_DTYPE": 16 + }, + "39": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 1536 + ] + }, + "40": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 1536 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 1536 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 1536 + ] + }, + "44": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 1536 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 1536 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 1536 + ] + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 1536 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 1536 + ] + }, + "50": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 1536 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 1536 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 1536 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 1536 + ] + }, + "54": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 1536 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 1536 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 1536 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 1536 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 1536 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 1536 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 1536 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 1536 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 1536 + ] + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 1536 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 1536 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 1536 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 1536 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 
69, + 1536 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 1536 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 1536 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 1536 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 1536 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 1536 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 1536 + ] + }, + "76": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 1536 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 1536 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 1536 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 1536 + ] + }, + "80": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 80, + 1536 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 81, + 1536 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 1536 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 1536 + ] + }, + "85": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 85, + 1536 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 1536 + ] + }, + "87": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 87, + 1536 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 1536 + ] + }, + "89": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 91, + 1536 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 1536 + ] + }, + "93": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 93, + 1536 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 95, + 1536 + ] + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 105, + 1536 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 4, + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 111, + 1536 + ] + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 112, + 
1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 114, + 1536 + ] + }, + "115": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 115, + 1536 + ] + }, + "116": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 116, + 1536 + ] + }, + "117": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 118, + 1536 + ] + }, + "119": { + 
"USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 121, + 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 128, + 1536 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..ef027354d4f1119887c7e839f29d7a47f655bbab --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2603 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, 
+ "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 1536 + ] + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 1536 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 1536 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 1536 + ] + }, + "14": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 1536 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 1536 + ] + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 1536 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 1536 + ] + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 1536 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 1536 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 1536 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 1536 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 1536 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 1536 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 
1536 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 1536 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 1536 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 1536 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 1536 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 1536 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 1536 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 1536 + ] + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 1536 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 1536 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 1536 + ] + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 1536 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 1536 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 1536 + ] + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 1536 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 1536 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 1536 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 1536 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 1536 + ] + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 1536 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 1536 + ] + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 1536 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 1536 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 1536 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 1536 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 1536 + ] + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 1536 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + 
"DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 1536 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 1536 + ] + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 1536 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 1536 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 1536 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 1536 + ] + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 1536 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 1536 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 1536 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 1536 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 1536 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 1536 + ] + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 1536 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 1536 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 1536 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 1536 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 1536 + ] + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 1536 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 1536 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 1536 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 1536 + ] + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 
1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 1536 + ] + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 1536 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 1536 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 1536 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 1536 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 1536 + ] + 
}, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 1536 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..09db8585333e2b9ba4e680b4f2eb3eab910fac38 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2573 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 
1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 1536 + ] + }, + "11": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 1536 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 1536 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 1536 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 1536 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 1536 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 1536 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 1536 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 1536 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 1536 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 1536 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 1536 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 1536 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 1536 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 1536 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 1536 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 1536 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 1536 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 1536 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 1536 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 1536 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 1536 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 1536 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 1536 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 1536 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 1536 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 1536 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 1536 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 1536 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 1536 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 1536 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 1536 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 1536 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 1536 + ] + }, + "48": { 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 1536 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 1536 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 1536 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 1536 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 1536 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 1536 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 
64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 1536 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 1536 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 1536 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 1536 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 1536 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 1536 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 1536 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 1536 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 1536 + ] + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 65, + 1536 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, 
+ "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 66, + 1536 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 67, + 1536 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 68, + 1536 + ] + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 69, + 1536 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 1536 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 1536 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 1536 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 1536 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 1536 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 1536 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 1536 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 1536 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 78, + 1536 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 1536 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 1536 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 1536 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 1536 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 1536 + ] + }, + "85": { 
+ "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 1536 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 1536 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 1536 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 1536 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 1536 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 1536 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 1536 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 1536 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 1536 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 1536 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 1536 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 1536 + ] + }, + "115": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 4, + 115, + 1536 + ] + }, + "116": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 116, + 1536 + ] + }, + "117": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, 
+ 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 1536 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..794141b6de220a9551e5c1d1d4c175c310f28a24 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2594 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ 
+ 10, + 1536 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 1536 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 1536 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 1536 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 1536 + ] + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 1536 + ], + "D_DTYPE": 16 + }, + "17": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 1536 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 1536 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 1536 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 1536 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 1536 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 1536 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 1536 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 1536 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 1536 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 1536 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 1536 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 1536 + ], + "D_DTYPE": 16 + }, + "42": { 
+ "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 1536 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 1536 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 1536 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 1536 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 1536 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 1536 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 1536 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 1536 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 1536 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 1536 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 1536 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 1536 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 1536 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 1536 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 1536 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 1536 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 1536 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 1536 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 1536 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 1536 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 1536 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 1536 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 1536 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 1536 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 1536 + ], + "D_DTYPE": 16 + }, + "67": { 
+ "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 1536 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 1536 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 1536 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 1536 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 1536 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 1536 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 1536 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 1536 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 1536 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 1536 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 1536 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 1536 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 1536 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 1536 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 1536 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 1536 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 1536 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 1536 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 1536 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 1536 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 1536 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 1536 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 1536 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 1536 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 1536 + ], + "D_DTYPE": 16 + }, + "92": { 
+ "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 1536 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 1536 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 1536 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 1536 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 1536 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 1536 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 1536 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 1536 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 1536 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..3f7a39bbaf6ca4590bf045fdcf430d843d47aa01 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2578 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 
1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 1536 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 1536 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 1536 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 1536 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 1536 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 1536 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 1536 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 1536 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 1536 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 1536 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 1536 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 1536 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 1536 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 1536 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 1536 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 1536 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 1536 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 1536 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 1536 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 1536 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 1536 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 1536 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 1536 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 1536 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 1536 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 1536 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 1536 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 39, + 1536 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 1536 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 1536 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 1536 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 1536 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] + }, + 
"46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 1536 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 1536 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 1536 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 1536 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 1536 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 1536 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 1536 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 1536 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 1536 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 1536 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 1536 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 1536 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 1536 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 1536 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 60, + 1536 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 1536 + ] + }, + "62": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 1536 + ] + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 65, + 1536 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 66, + 1536 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 67, + 1536 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 68, + 1536 + ] + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 69, + 1536 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 1536 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 1536 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 1536 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 1536 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 1536 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 1536 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 1536 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 1536 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 1536 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 1536 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 1536 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 1536 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 
1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 1536 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 1536 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 1536 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 1536 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 1536 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 1536 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 1536 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 1536 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 1536 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 1536 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 1536 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 1536 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 1536 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 1536 + ] + }, + "115": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 115, + 1536 + ] + }, + "116": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 116, + 1536 + ] + }, + "117": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 
32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + 
"126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 1536 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..badd571908d3b6435b17c89a38cac7c46c3f18b3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2588 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 1536 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 1536 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 1536 + ] + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 1536 + 
], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 1536 + ] + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 1536 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 1536 + ] + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 1536 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 1536 + ] + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 1536 + ] + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 1536 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 1536 + ] + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 1536 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 1536 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 1536 + ] + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 1536 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 1536 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 1536 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 1536 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] 
+ }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 1536 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 1536 + ] + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 1536 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 1536 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 1536 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 1536 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 1536 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 1536 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 1536 + ] + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 1536 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 1536 + ] + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 1536 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, 
+ "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 1536 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 1536 + ] + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 1536 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 1536 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 1536 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 1536 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 1536 + ] + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 1536 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 1536 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 1536 + ] + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 70, + 1536 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 1536 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 1536 + ] + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 1536 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 1536 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 1536 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 1536 + ] + }, + "77": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 1536 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 1536 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 1536 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 1536 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 1536 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 1536 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 1536 + ] + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 1536 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 1536 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 1536 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 1536 + ] + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 1536 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 1536 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 1536 + ] + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 1536 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 1536 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, 
+ "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 1536 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 1536 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 1536 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 1536 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 1536 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 
118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 1536 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=256,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=256,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..7283a9a25b9185ad7d1ad26d14457e454c016e00 --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=256,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2588 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 1536 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 1536 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 1536 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 1536 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 1536 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_SHAPE": [ + 6, + 1536 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 1536 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 1536 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 1536 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 1536 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 1536 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 1536 + ], + "D_DTYPE": 
16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 1536 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 1536 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 1536 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 1536 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 1536 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 1536 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 1536 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 1536 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 1536 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 1536 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 1536 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 1536 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 1536 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 1536 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 1536 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 1536 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 1536 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 1536 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 1536 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 1536 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 1536 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 1536 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 1536 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 
49, + 1536 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 1536 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 1536 + ], + "D_DTYPE": 16 + }, + "52": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 52, + 1536 + ], + "D_DTYPE": 16 + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 53, + 1536 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 1536 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 1536 + ], + "D_DTYPE": 16 + }, + "56": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 1536 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 1536 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 1536 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 1536 + ], + "D_DTYPE": 16 + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 1536 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 1536 + ], + "D_DTYPE": 16 + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 62, + 1536 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 1536 + ], + "D_DTYPE": 16 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 1536 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 1536 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 1536 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 1536 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 1536 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 1536 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 1536 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 1536 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 1536 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 1536 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 1536 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 1536 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 1536 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 1536 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 1536 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 1536 + ] + }, + 
"80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 1536 + ] + }, + "81": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 81, + 1536 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 1536 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 1536 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 1536 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 1536 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 1536 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 1536 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 1536 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 1536 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 1536 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 1536 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 1536 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 1536 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 1536 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 1536 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 1536 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 1536 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 1536 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 1536 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 1536 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 1536 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 1536 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 1536 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 1536 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 1536 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 1536 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 1536 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 1536 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 1536 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 1536 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 1536 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 1536 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 1536 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 1536 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 1536 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 1536 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 1536 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 1536 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 1536 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..e24605cf07e82f9d50229696aeff835cbdad24bf --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2588 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 1536 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 1536 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 1536 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 1536 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 1536 + ] + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 1536 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 1536 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 1536 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 1536 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 1536 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 1536 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + 
}, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 1536 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 1536 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 1536 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 1536 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 1536 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 1536 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 1536 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 1536 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 1536 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 1536 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 1536 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 1536 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 1536 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 1536 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 1536 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 1536 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 1536 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 1536 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 1536 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 1536 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 1536 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 1536 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 1536 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 1536 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 1536 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 1536 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 1536 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 1536 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 1536 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 1536 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 1536 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 1536 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 1536 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 1536 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 1536 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 1536 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 1536 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 1536 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 1536 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 1536 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 1536 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 1536 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 1536 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 1536 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 1536 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 1536 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 1536 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 1536 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 1536 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 1536 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 1536 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 1536 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 1536 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 1536 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 1536 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 1536 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 1536 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 1536 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 1536 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 1536 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 1536 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 1536 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 1536 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 1536 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 1536 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 1536 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 1536 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 1536 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 1536 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 1536 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 1536 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 1536 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 1536 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 1536 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 1536 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 1536 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 1536 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 1536 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 1536 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 1536 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 1536 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 1536 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 1536 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 1536 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 1536 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=512,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=512,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..02edd015601677211a63c0313816e4548433bf6d --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=512,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2591 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 1, + 1536 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 
0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 2, + 1536 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 3, + 1536 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 4, + 1536 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 5, + 1536 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 6, + 1536 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 7, + 1536 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 8, + 1536 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 9, + 1536 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 10, + 1536 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 11, + 1536 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 12, + 1536 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 1536 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 1536 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 1536 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 1536 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 1536 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 20, + 1536 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 1536 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + 
"27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 1536 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 1536 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 1536 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 1536 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 1536 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 1536 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 1536 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 1536 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 1536 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 1536 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 1536 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 1536 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 1536 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 1536 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 1536 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 1536 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 1536 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 1536 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 1536 + ], + "D_DTYPE": 16 + }, + "52": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 52, + 1536 + ], + "D_DTYPE": 16 + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 53, + 1536 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 1536 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 1536 + ], + "D_DTYPE": 16 + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 1536 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "D_SHAPE": [ + 57, + 1536 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 1536 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 1536 + ], + "D_DTYPE": 16 + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 1536 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 1536 + ], + "D_DTYPE": 16 + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 62, + 1536 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 1536 + ], + "D_DTYPE": 16 + 
}, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 1536 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 65, + 1536 + ], + "D_DTYPE": 16 + }, + "66": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 1536 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 1536 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 68, + 1536 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 69, + 1536 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 70, + 1536 + ], + "D_DTYPE": 16 + }, + "71": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 71, + 1536 + ], + "D_DTYPE": 16 + }, + "72": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 72, + 1536 + ], + "D_DTYPE": 16 + }, + "73": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 73, + 1536 + ], + "D_DTYPE": 16 + }, + "74": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 74, + 1536 + ], + "D_DTYPE": 16 + }, + "75": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 75, + 1536 + ], + "D_DTYPE": 16 + }, + "76": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + 
"SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 76, + 1536 + ], + "D_DTYPE": 16 + }, + "77": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 77, + 1536 + ], + "D_DTYPE": 16 + }, + "78": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 78, + 1536 + ], + "D_DTYPE": 16 + }, + "79": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 79, + 1536 + ], + "D_DTYPE": 16 + }, + "80": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 80, + 1536 + ], + "D_DTYPE": 16 + }, + "81": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 81, + 1536 + ], + "D_DTYPE": 16 + }, + "82": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 82, + 1536 + ], + "D_DTYPE": 16 + }, + "83": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 83, + 1536 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 1536 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 1536 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 1536 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 1536 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 1536 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 1536 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 1536 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 1536 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 1536 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 94, + 1536 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 1536 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, 
+ "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 1536 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 1536 + ] + }, + 
"112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 1536 + ] + }, + "115": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 115, + 1536 + ], + "D_DTYPE": 16 + }, + "116": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 116, + 1536 + ], + "D_DTYPE": 16 + }, + "117": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 117, 
+ 1536 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 1536 + ] + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..33f5564b1a640eb37873c79a87196ffbf143a846 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2613 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 7, + 1536 + ], + "D_DTYPE": 16 + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 1536 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 1536 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 1536 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 1536 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 1536 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 1536 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 1536 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 1536 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 1536 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 1536 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 1536 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 1536 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": 
[ + 36, + 1536 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 1536 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 1536 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 1536 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 1536 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 1536 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 1536 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 1536 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 1536 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 1536 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + 
"DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 1536 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 1536 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 1536 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 1536 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 1536 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 1536 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + 
"DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 1536 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 1536 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 1536 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 1536 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 1536 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 1536 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 1536 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 1536 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 1536 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 1536 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 1536 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 1536 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 1536 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 1536 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 1536 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 1536 + ] + }, + "72": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 1536 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 1536 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 1536 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 1536 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 1536 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + 
"NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 1536 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 1536 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 1536 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 1536 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 1536 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 1536 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 1536 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 1536 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 1536 + ] + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 1536 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 1536 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 1536 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 1536 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 1536 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 1536 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 1536 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 1536 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 1536 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 1536 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 1536 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 1536 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 1536 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 1536 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 1536 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 1536 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 1536 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 1536 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 1536 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 1536 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 1536 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 1536 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 1536 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 1536 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 1536 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 1536 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 1536 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..fccd731ca28fee58f713144e2f3a48c899f523a0 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2602 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 1536 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 1536 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 
1536 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 1536 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 1536 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 1536 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 1536 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 1536 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 1536 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 1536 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 1536 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 1536 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 1536 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 1536 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 1536 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + 
"SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 1536 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 1536 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 1536 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 1536 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 1536 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 1536 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 1536 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 34, + 1536 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 1536 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 36, + 1536 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 1536 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 38, + 1536 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 1536 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 40, + 1536 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 1536 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 42, + 1536 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 1536 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 44, + 1536 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 1536 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 46, + 1536 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 1536 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 48, + 1536 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 1536 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 
32, + "D_SHAPE": [ + 8, + 50, + 1536 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 1536 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 1536 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 53, + 1536 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 54, + 1536 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 1536 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 56, + 1536 + ] + }, 
+ "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 1536 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 58, + 1536 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 1536 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 60, + 1536 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 1536 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 62, + 1536 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 1536 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 64, + 1536 + ] + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 65, + 1536 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 66, + 1536 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 67, + 1536 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 68, + 1536 + ] + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 1536 + ] + }, + "70": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 70, + 1536 + ] + }, + "71": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 71, + 1536 + ] + }, + "72": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 72, + 1536 + ] + }, + "73": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 73, + 1536 + ] + }, + "74": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 74, + 1536 + ] + }, + "75": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + 
"USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 75, + 1536 + ] + }, + "76": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 76, + 1536 + ] + }, + "77": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 77, + 1536 + ] + }, + "78": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 78, + 1536 + ] + }, + "79": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 79, + 1536 + ] + }, + "80": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 80, + 1536 + ] + }, + "81": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + 
"NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 81, + 1536 + ] + }, + "82": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 82, + 1536 + ] + }, + "83": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 83, + 1536 + ] + }, + "84": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 84, + 1536 + ] + }, + "85": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 85, + 1536 + ] + }, + "86": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 86, + 1536 + ] + }, + "87": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 87, + 1536 + ] + }, + "88": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 88, + 1536 + ] + }, + "89": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 89, + 1536 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 1536 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 1536 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 1536 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 1536 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 1536 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 1536 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 1536 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 1536 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 1536 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 99, + 1536 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 1536 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 1536 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 1536 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 1536 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 1536 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 1536 + ] + 
}, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 1536 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 1536 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 1536 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 1536 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 1536 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 1536 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 1536 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 1536 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 1536 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 1536 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 1536 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 1536 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 1536 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 1536 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 1536 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 1536 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 1536 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 1536 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 1536 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 1536 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 1536 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 1536 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 1536 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..1c9b82958695709d2737525a95a795d59d4a874b --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=1536,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 1536 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 1536 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 1536 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 1536 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 1536 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 1536 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 1536 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 1536 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 1536 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 1536 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 1536 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 1536 + 
] + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 1536 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 1536 + ] + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 1536 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 1536 + ] + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 1536 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 1536 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 1536 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 1536 + ] + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 1536 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 1536 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 1536 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 1536 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, 
+ "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 1536 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 1536 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 1536 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 1536 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 1536 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 1536 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 1536 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 1536 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 1536 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 1536 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 1536 + ] + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 1536 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 1536 + ], + 
"D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 1536 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 1536 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 1536 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 1536 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 1536 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 1536 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 1536 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 1536 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 1536 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 1536 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 1536 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 1536 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 1536 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 1536 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 1536 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 1536 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 1536 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 1536 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, 
+ "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 1536 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 1536 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 1536 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 1536 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 1536 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 1536 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 1536 + ], + "D_DTYPE": 
16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 1536 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 1536 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 1536 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 1536 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 1536 + ] + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 1536 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, 
+ "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 1536 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 1536 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 1536 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 1536 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 1536 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 1536 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 
1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 1536 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 1536 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 1536 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 1536 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 1536 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 1536 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 1536 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 1536 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 1536 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 1536 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 1536 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 1536 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 1536 + ], + 
"D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 1536 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 1536 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 1536 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 1536 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 1536 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 1536 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 1536 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 1536 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 1536 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 1536 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 1536 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 1536 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, 
+ "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 1536 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 1536 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 1536 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 1536 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 1536 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 1536 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 1536 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 1536 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 1536 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 1536 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 1536 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 1536 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 
112, + 1536 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 1536 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 1536 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 1536 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 1536 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 1536 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 1536 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 
0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 1536 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 1536 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 1536 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 1536 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 1536 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 1536 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 1536 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 1536 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 1536 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 1536 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=2112,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=2112,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..9348bdd22b6ce773e5324bd6e39e28e663938480 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=2112,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 1, + 2112 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 2112 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 2112 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 2112 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 2112 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 2112 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 2112 + ] + }, + "8": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 2112 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 2112 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 2112 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 2112 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 2112 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 2112 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 2112 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 2112 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 2112 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 2112 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 2112 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 2112 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 2112 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 2112 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 2112 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 2112 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 2112 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 2112 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 2112 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 2112 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 2112 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 2112 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 2112 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 2112 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 2112 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 2112 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 2112 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 2112 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 2112 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 2112 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 
2112 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 2112 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 2112 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 2112 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 2112 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 2112 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 2112 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 2112 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 2112 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 2112 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 2112 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 2112 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 2112 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 2112 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 2112 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 2112 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 2112 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 2112 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 2112 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 2112 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 2112 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 2112 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 2112 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 2112 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 2112 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 2112 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 2112 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 2112 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 2112 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 2112 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 2112 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, 
+ "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 2112 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 2112 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 2112 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 2112 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 2112 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 2112 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 2112 + ] + }, + 
"76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 2112 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 2112 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 2112 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 2112 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 2112 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 2112 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 2112 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 2112 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 2112 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 2112 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 2112 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 2112 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 2112 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 2112 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 2112 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 2112 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 2112 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 2112 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 2112 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 2112 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 2112 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 2112 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 2112 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 2112 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 2112 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 2112 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 2112 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 2112 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 2112 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 2112 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 2112 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 2112 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 2112 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 2112 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 2112 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 2112 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 112, + 2112 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 2112 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 2112 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 2112 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 2112 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 2112 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 2112 + ] + }, + "119": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 2112 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 2112 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 2112 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 2112 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 2112 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 2112 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 2112 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 2112 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 2112 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 2112 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..45a0b433653ce3cf96de0db80ad6919cd5bd2f56 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2629 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 3072 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 3072 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 3072 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 3072 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 3072 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 3072 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 3072 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 3072 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 3072 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 3072 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 3072 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 3072 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 3072 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 3072 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 3072 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 3072 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 3072 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 3072 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 3072 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 3072 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 3072 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 3072 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 3072 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 3072 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 3072 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 3072 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 3072 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 3072 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 43, + 3072 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 44, + 3072 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 45, + 3072 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 46, + 3072 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 3072 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 3072 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 3072 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 3072 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 3072 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 3072 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 3072 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 3072 + ] + }, + "55": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 55, + 3072 + ] + }, + "56": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 
3072 + ] + }, + "57": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 57, + 3072 + ] + }, + "58": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 3072 + ] + }, + "59": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 59, + 3072 + ] + }, + "60": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 60, + 3072 + ] + }, + "61": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 61, + 3072 + ] + }, + "62": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 3072 + ] + }, + "63": { + "USE_REDUCE_KERNEL": 1, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 63, + 3072 + ] + }, + "64": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 3072 + ] + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 3072 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 66, + 3072 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 3072 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 3072 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 
0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 3072 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 3072 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 71, + 3072 + ] + }, + "72": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 72, + 3072 + ] + }, + "73": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 73, + 3072 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 3072 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 3072 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 3072 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 3072 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 3072 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 3072 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 3072 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 3072 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 3072 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 3072 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 3072 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 3072 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 3072 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 3072 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 3072 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 3072 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 3072 + ] + }, + "92": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 92, + 3072 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 
3072 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 3072 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 3072 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 3072 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, 
+ "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 3072 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 
32, + "D_SHAPE": [ + 116, + 3072 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 3072 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 3072 + ] + }, + "128": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..ca74ca2cb5ff7d83d5b853dd45e60bf4c06ded21 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2616 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, 
+ "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 3072 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + 
}, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 3072 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 3072 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 3072 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 3072 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 3072 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 3072 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 3072 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 3072 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 3072 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 3072 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 3072 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 3072 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 3072 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 3072 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 3072 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 3072 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 3072 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 3072 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 3072 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 3072 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 3072 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 
1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 3072 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 3072 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 3072 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 3072 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 3072 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 3072 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 3072 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 3072 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 3072 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 3072 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 3072 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + 
"NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 3072 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 3072 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 3072 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 3072 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 3072 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 3072 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 3072 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 3072 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 3072 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 3072 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 3072 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 3072 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 3072 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 3072 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 3072 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 3072 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 3072 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 3072 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 3072 + ] + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 3072 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 3072 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 3072 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 3072 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 3072 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 3072 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 3072 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 3072 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 3072 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 3072 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 3072 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 3072 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 3072 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 3072 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 3072 + ] + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 3072 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 3072 + ] + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 3072 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 3072 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 3072 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 3072 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 3072 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 3072 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 3072 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 3072 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 3072 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 3072 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 3072 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 3072 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 3072 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 3072 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 3072 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 3072 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 3072 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 3072 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 3072 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": 
{ + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 3072 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 3072 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 3072 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 3072 + ], + "D_DTYPE": 16 + }, + 
"118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 3072 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 3072 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, 
+ 3072 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 3072 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 3072 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json 
b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..308424d5912e45c4d49d23575533e096b494db74 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2622 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 3072 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 3072 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 3072 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 3072 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 3072 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 3072 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 3072 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 3072 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 3072 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 3072 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 3072 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 3072 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 3072 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 3072 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 3072 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 3072 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 3072 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 3072 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 3072 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 3072 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 3072 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 3072 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 3072 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 3072 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 3072 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 3072 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 3072 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 3072 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 
42, + 3072 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 43, + 3072 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 44, + 3072 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 45, + 3072 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 46, + 3072 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 3072 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 3072 + ] + }, + "49": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 3072 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 3072 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 3072 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 3072 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 3072 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 3072 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 3072 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 3072 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 3072 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 3072 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 3072 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 60, + 3072 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 3072 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 3072 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 3072 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 3072 + ] + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 65, + 3072 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 66, + 3072 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + 
"NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 67, + 3072 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 68, + 3072 + ] + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 69, + 3072 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 3072 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 3072 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 3072 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 3072 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 3072 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 3072 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 3072 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 3072 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 3072 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 3072 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 3072 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 3072 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 3072 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 3072 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 3072 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 
3072 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 3072 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 3072 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 3072 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 3072 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 3072 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 3072 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 3072 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 3072 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 3072 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 3072 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 3072 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 3072 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 3072 + ] + }, + "120": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 120, + 3072 + ] + }, + "121": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 3072 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..418ae287348340a918f0af07fecc7932ee01b2fe --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2594 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 3072 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 3072 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 3072 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 3072 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 3072 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 3072 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 3072 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 3072 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 3072 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 3072 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 3072 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 3072 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 3072 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 
3072 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 3072 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 3072 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 3072 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 3072 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 3072 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 3072 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 3072 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 3072 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 3072 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 3072 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 3072 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 3072 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 3072 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 3072 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 3072 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 3072 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 3072 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 3072 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 3072 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 3072 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 3072 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 3072 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 3072 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 46, + 3072 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 3072 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 3072 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 3072 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 3072 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 3072 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 3072 + ], + "D_DTYPE": 16 + }, + "53": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 3072 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 3072 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 3072 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 3072 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 3072 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 3072 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 3072 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 3072 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 3072 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 3072 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 3072 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 3072 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 3072 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 3072 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 3072 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 3072 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 3072 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 3072 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 3072 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 3072 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 3072 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 3072 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 3072 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 3072 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 3072 + ], + "D_DTYPE": 
16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 3072 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 3072 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 3072 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 3072 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 3072 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 3072 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 3072 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 3072 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 3072 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 3072 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 3072 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 3072 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 3072 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 3072 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 3072 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 3072 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 3072 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 3072 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 3072 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + 
"108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 3072 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 3072 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 3072 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 
124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 3072 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..87a29702a74f61c9e4c14f09ea4dbfde48a3c78f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2622 @@ +{ + "1": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 
8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 3072 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 3072 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 3072 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 3072 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 3072 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 3072 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 3072 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 3072 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 3072 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 3072 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 3072 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 25, + 3072 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 3072 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 3072 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 3072 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 3072 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 3072 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 31, + 3072 + ] + }, + "32": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 32, + 3072 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 3072 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 3072 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 3072 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 3072 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 3072 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 3072 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 3072 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 3072 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 3072 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 3072 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 43, + 3072 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 44, + 3072 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 45, + 3072 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 46, + 3072 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 3072 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 3072 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 3072 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 3072 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 3072 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 3072 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 3072 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 3072 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 3072 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 3072 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 3072 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 3072 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 3072 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 60, + 3072 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 3072 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 3072 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 3072 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 3072 + ] + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 65, + 3072 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 66, + 3072 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 67, + 3072 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 68, + 3072 + ] + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 69, + 3072 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 3072 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 3072 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 3072 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 3072 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 74, + 3072 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 3072 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 3072 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 3072 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 3072 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 3072 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 3072 + ] + }, + "81": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 3072 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 3072 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 83, + 3072 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 3072 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 3072 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 3072 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 3072 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 3072 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 3072 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 3072 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 3072 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 3072 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 3072 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 3072 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 3072 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 3072 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 3072 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 3072 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 3072 + ] + }, + 
"128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..6ed749044427b75e2b3060ef0fa0feda04089f16 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2641 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 3072 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 3072 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 
16, + 3072 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 3072 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 3072 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 3072 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 3072 + ] + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 3072 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 3072 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 3072 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 3072 + ] + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 3072 + ] + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 3072 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 3072 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 3072 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 3072 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 3072 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 3072 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 3072 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 3072 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 3072 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 3072 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 3072 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 3072 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 3072 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 3072 + ] + }, + "41": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 3072 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 3072 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 3072 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 3072 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 3072 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + 
"DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 3072 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 3072 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 3072 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 3072 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 3072 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 3072 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 3072 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 3072 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 3072 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 3072 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 3072 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 3072 + ] + }, + "58": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 3072 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 3072 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 3072 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 3072 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 3072 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 63, + 3072 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 3072 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 3072 + ] + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 3072 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 3072 + ] + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 3072 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 3072 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 3072 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 3072 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 3072 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 3072 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 3072 + ] + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 3072 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 3072 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 3072 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 3072 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 3072 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 3072 + ] + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 3072 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 3072 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 3072 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 3072 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 3072 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 3072 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 3072 + ] + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 3072 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 3072 + ] + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 3072 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 3072 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 3072 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 3072 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 3072 + ] + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 3072 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ 
+ 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 3072 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, 
+ "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 3072 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 3072 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 
32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, 
+ "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 3072 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=256,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=256,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..c9cdd0fbe71f893a60b1e26867de580db2246b2c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=256,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2593 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 3072 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 3072 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 3072 + ], + 
"D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 3072 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 3072 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 3072 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 3072 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 3072 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 3072 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 3072 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 3072 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 3072 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 3072 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 3072 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 3072 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 3072 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 3072 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 3072 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 3072 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 3072 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 3072 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 3072 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 3072 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 3072 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 3072 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 3072 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 3072 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 3072 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 3072 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 3072 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 3072 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 3072 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 3072 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 3072 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 3072 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 3072 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 3072 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 3072 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 
40, + 3072 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 3072 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 3072 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 3072 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 3072 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 3072 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 3072 + ], + "D_DTYPE": 16 + }, + "47": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 3072 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 3072 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 3072 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 50, + 3072 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 3072 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 3072 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 3072 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 3072 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 55, + 3072 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 3072 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 3072 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 3072 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 3072 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 3072 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 3072 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 3072 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 3072 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 3072 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 3072 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 3072 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 3072 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 3072 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 3072 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 3072 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 3072 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 3072 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 3072 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 3072 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 3072 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 3072 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 77, + 3072 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 3072 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 3072 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 3072 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 3072 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 3072 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 3072 + ], + "D_DTYPE": 16 + }, + "84": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 3072 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 3072 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 3072 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 3072 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 3072 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 3072 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 3072 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 3072 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 3072 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 3072 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 3072 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 3072 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 3072 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 
64, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 3072 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 3072 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 3072 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 3072 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 3072 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..8f6a271f430d751b4c7215c139f00e5558a42008 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2624 @@ +{ + "1": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": 
{ + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 3072 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 3072 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 3072 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 3072 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 3072 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 3072 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 3072 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 3072 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 3072 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 3072 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 3072 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 3072 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 3072 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 3072 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 3072 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 3072 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 3072 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 3072 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 3072 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 3072 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 3072 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 3072 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 3072 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 3072 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 3072 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 3072 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 3072 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 3072 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 3072 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 3072 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 3072 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 3072 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 3072 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 
3072 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 3072 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 3072 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 3072 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 3072 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 3072 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 3072 + ], + "D_DTYPE": 16 + }, + "55": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 3072 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 3072 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 3072 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 3072 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 3072 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 3072 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 3072 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 3072 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 3072 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 3072 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 3072 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 3072 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 3072 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 3072 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 3072 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 3072 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 3072 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 3072 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 3072 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 3072 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 3072 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 3072 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 3072 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 3072 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 3072 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 3072 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 3072 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 3072 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 3072 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 3072 + ] + }, + "85": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 3072 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 3072 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 3072 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 3072 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 3072 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 3072 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 3072 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 3072 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 3072 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 3072 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 3072 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 3072 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 3072 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 3072 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 3072 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 3072 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 3072 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 3072 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 3072 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 3072 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 3072 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 3072 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=512,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=512,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..10d779a05e797c6d38d23b61346545dd36634bfd --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=512,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2626 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 1, + 3072 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 3072 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 3072 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 3072 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 3072 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 3072 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 3072 + ], + "D_DTYPE": 16 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 3072 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 3072 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 3072 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 3072 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 3072 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 3072 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 3072 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 3072 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 3072 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 3072 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 3072 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 3072 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 3072 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 3072 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 3072 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 3072 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 3072 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 3072 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 3072 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 3072 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 3072 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 3072 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 3072 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 3072 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 3072 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 3072 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 3072 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 3072 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 3072 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "D_SHAPE": [ + 38, + 3072 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 3072 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 3072 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 3072 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 3072 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 3072 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 3072 + ], + "D_DTYPE": 16 + 
}, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 3072 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 3072 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 3072 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 3072 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 49, + 3072 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 3072 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, 
+ "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 3072 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 3072 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 3072 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 3072 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 3072 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 3072 + ] + }, + "57": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 3072 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 3072 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 3072 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 3072 + ] + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 3072 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 3072 + ] + }, + "63": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 63, + 3072 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 3072 + ] + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 3072 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 3072 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 3072 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 3072 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 3072 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 3072 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 3072 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 3072 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 3072 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": 
[ + 74, + 3072 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 3072 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 3072 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 3072 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 3072 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 3072 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 3072 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 3072 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 3072 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 3072 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 3072 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 3072 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 3072 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 3072 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 3072 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 3072 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 3072 + 
] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 3072 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 3072 + ] + }, + "94": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 3072 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 3072 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 3072 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, 
+ "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 3072 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 3072 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 3072 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 3072 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 3072 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 3072 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 3072 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 3072 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 3072 + ] + }, + "126": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 3072 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..0e433b413549eb0be750aeee6a712d8a5123e3b2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2591 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 3072 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 3072 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 3072 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 3072 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 3072 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 3072 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 3072 + ], + 
"D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 3072 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 3072 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 3072 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 3072 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 
3072 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 3072 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 3072 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 3072 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 3072 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 3072 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 3072 + ] + }, + "33": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 3072 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 3072 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 3072 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 3072 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 3072 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 3072 + ], + 
"D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 3072 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 3072 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 3072 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 3072 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 3072 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 3072 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 3072 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 3072 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 3072 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 3072 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 3072 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 3072 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 3072 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 3072 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 3072 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 3072 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 3072 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 3072 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 3072 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 3072 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 3072 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 3072 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 3072 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 3072 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 3072 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 3072 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 3072 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 3072 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 3072 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 3072 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 69, + 3072 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 3072 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 3072 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 3072 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 3072 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 3072 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 3072 + ] + }, + "76": { 
+ "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 3072 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 3072 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 3072 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 3072 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 3072 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 3072 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 3072 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 3072 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 3072 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 3072 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 3072 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 3072 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 3072 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 3072 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 3072 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 3072 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 3072 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 3072 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 3072 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 3072 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 3072 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 3072 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 3072 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 3072 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 3072 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 3072 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 3072 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 3072 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 3072 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 3072 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 3072 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 3072 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 3072 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 3072 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 3072 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 3072 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 3072 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 3072 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 3072 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 3072 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 3072 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 3072 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 3072 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 3072 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 3072 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 3072 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 3072 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 3072 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 3072 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 3072 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 3072 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 3072 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..b948526650093b1c5cf971abca0ce12b68d58070 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2634 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 3072 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 
32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 3072 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 3072 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 3072 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 3072 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 3072 + ] + }, + "18": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 3072 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 3072 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 3072 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 3072 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 3072 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 3072 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 3072 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 3072 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 3072 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 3072 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 3072 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 3072 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 3072 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 3072 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 3072 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 3072 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 3072 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 3072 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 3072 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 3072 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 3072 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 3072 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 3072 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 3072 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 3072 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 3072 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 3072 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 3072 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 3072 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 3072 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 3072 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 3072 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 3072 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 3072 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 3072 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 3072 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 
3072 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 3072 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 3072 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 3072 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 3072 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 3072 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 60, + 3072 + ] + }, + "61": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 3072 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 3072 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 3072 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 3072 + ] + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 65, + 3072 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 66, + 3072 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + 
"SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 67, + 3072 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 68, + 3072 + ] + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 69, + 3072 + ] + }, + "70": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 70, + 3072 + ] + }, + "71": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 71, + 3072 + ] + }, + "72": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 72, + 3072 + ] + }, + "73": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 73, + 3072 + ] + }, + "74": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 74, + 3072 + ] + }, + "75": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 75, + 3072 + ] + }, + "76": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 76, + 3072 + ] + }, + "77": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 77, + 3072 + ] + }, + "78": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 78, + 3072 + ] + }, + "79": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + 
"NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 79, + 3072 + ] + }, + "80": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 80, + 3072 + ] + }, + "81": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 81, + 3072 + ] + }, + "82": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 82, + 3072 + ] + }, + "83": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 83, + 3072 + ] + }, + "84": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 84, + 3072 + ] + }, + "85": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 85, + 3072 + ] + }, + "86": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 86, + 3072 + ] + }, + "87": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 87, + 3072 + ] + }, + "88": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 88, + 3072 + ] + }, + "89": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 89, + 3072 + ] + }, + "90": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 90, + 3072 + ] + }, + "91": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 91, + 3072 + ] + }, + "92": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 92, + 3072 + ] + }, + "93": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 93, + 3072 + ] + }, + "94": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 94, + 3072 + ] + }, + "95": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 95, + 3072 + ] + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 96, + 3072 + ] + }, + "97": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 97, + 3072 + ] + }, + "98": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 98, + 3072 + ] + }, + "99": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 99, + 3072 + ] + }, + "100": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 100, + 3072 + ] + }, + "101": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 101, + 3072 + ] + }, + "102": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 102, + 3072 + ] + }, + "103": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 103, + 3072 + ] + }, + "104": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 104, + 3072 + ] + }, + "105": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 105, + 3072 + ] + }, + "106": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 106, + 3072 + ] + }, + "107": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 107, + 3072 + ] + }, + "108": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 108, + 3072 + ] + }, + "109": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 109, + 3072 + ] + }, + "110": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 110, + 3072 + ] + }, + "111": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 111, + 3072 + ] + }, + "112": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 112, + 3072 + ] + }, + "113": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 113, + 3072 + ] + }, + "114": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 114, + 3072 + ] + }, + "115": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 4, + 115, + 3072 + ] + }, + "116": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 116, + 3072 + ] + }, + "117": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 117, + 3072 + ] + }, + "118": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 118, + 3072 + ] + }, + "119": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 119, + 3072 + ] + }, + "120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 120, + 3072 + ] + }, + "121": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 4, + 121, + 3072 + ] + }, + "122": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 122, + 3072 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 3072 + ] + }, + "124": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 124, + 3072 + ] + }, + "125": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 125, + 3072 + ] + }, + "126": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 126, + 3072 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 
3072 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 3072 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..065551d4af67c324f7d898038cabc8cba6b33e6b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=3072,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 3072 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 3072 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 3072 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 3072 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 3072 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 3072 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 3072 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 3072 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 3072 + ] + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 3072 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 3072 + ] + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 3072 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 3072 + ] + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 3072 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 3072 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 3072 + ], + 
"D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 3072 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 3072 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 3072 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 3072 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 3072 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 3072 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 3072 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 3072 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 3072 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 3072 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 3072 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 3072 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 3072 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 3072 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 3072 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 3072 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 3072 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 3072 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 3072 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 3072 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 3072 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 3072 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 3072 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 3072 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 
3072 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 3072 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 3072 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 3072 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 3072 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 3072 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 3072 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 3072 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 3072 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 3072 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 3072 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 3072 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 3072 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 3072 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 3072 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 3072 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 3072 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 3072 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 3072 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 3072 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 3072 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 3072 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 3072 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 3072 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 3072 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 66, + 3072 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 3072 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 3072 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 3072 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 3072 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 3072 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 3072 + ], + "D_DTYPE": 16 + }, + "73": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 3072 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 3072 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 3072 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 3072 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 3072 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 3072 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 3072 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 3072 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 3072 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 3072 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 3072 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 3072 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 3072 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 3072 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 3072 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 3072 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 3072 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 3072 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 3072 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 3072 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 3072 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 3072 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 3072 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 3072 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 
97, + 3072 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 3072 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 3072 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 3072 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 3072 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 3072 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 3072 + ], + "D_DTYPE": 16 + }, + "104": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 3072 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 3072 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 3072 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 3072 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 3072 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 3072 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 3072 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 3072 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 3072 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 3072 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 3072 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 3072 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 3072 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 3072 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 3072 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 3072 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 3072 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 3072 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, 
+ "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 3072 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 3072 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 3072 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 3072 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 3072 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 3072 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 128, + 3072 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..e28c630541e4101ffba57a2bee4bbe141e9e72a4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2609 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4096 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, 
+ "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4096 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 4096 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4096 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 4096 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 
2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 4096 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 4096 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 4096 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 4096 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 4096 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4096 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 
23, + 4096 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4096 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4096 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4096 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 4096 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 4096 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 4096 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 4096 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 4096 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 4096 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 4096 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 4096 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4096 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + 
"SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4096 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4096 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 4096 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4096 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4096 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 4096 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 4096 + ], + "D_DTYPE": 16 + }, + "52": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 52, + 4096 + ], + "D_DTYPE": 16 + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 53, + 4096 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 4096 + ], + "D_DTYPE": 16 + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 4096 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 4096 + ], + "D_DTYPE": 16 + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, 
+ "num_stages": 1, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 4096 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, 
+ 4096 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4096 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4096 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4096 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 4096 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4096 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4096 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4096 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4096 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4096 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4096 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + 
"DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4096 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4096 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4096 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4096 + ] + }, + 
"114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4096 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4096 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4096 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4096 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4096 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4096 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4096 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4096 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4096 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4096 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4096 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4096 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..8686e9f48b0217a613beb12a930b95672edda877 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2624 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 4096 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 4096 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4096 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, 
+ "D_SHAPE": [ + 13, + 4096 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4096 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4096 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4096 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4096 + ] + }, + "20": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4096 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4096 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 4096 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4096 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4096 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 4096 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 4096 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 4096 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 4096 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 4096 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 4096 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 4096 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 4096 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4096 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 4096 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 4096 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4096 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4096 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 4096 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 4096 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 4096 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 4096 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 4096 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 
43, + 4096 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 4096 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 4096 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 4096 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 4096 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 4096 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 4096 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 4096 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4096 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4096 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 4096 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 4096 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4096 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 4096 + ] + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4096 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4096 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 
1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4096 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4096 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4096 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4096 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4096 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4096 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4096 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4096 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4096 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4096 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4096 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4096 + ], + "D_DTYPE": 16 + }, + "102": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4096 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4096 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4096 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4096 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4096 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4096 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4096 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4096 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4096 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4096 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4096 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4096 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4096 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4096 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4096 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4096 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4096 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4096 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4096 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4096 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe96b74a891312307d557f1cc3ae0c9fcbbb631 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2613 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4096 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4096 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4096 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4096 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4096 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4096 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4096 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4096 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4096 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4096 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4096 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4096 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4096 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4096 + ], + 
"D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4096 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 4096 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 4096 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 4096 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 4096 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 4096 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 4096 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 4096 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 4096 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 4096 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 4096 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 43, + 4096 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 44, + 4096 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, 
+ "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 45, + 4096 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 46, + 4096 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 4096 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 4096 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 4096 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 4096 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4096 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4096 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 4096 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4096 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 4096 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 4096 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4096 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4096 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + 
"DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 4096 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 4096 + ], + "D_DTYPE": 16 + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 4096 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4096 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4096 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 69, + 4096 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { 
+ "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4096 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4096 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4096 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4096 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4096 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 100, + 4096 + ] + }, + "101": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 2, + 106, + 4096 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 109, + 4096 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 110, + 4096 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 112, + 
4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4096 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4096 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4096 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4096 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4096 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4096 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4096 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4096 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4096 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4096 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4096 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4096 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4096 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 
0000000000000000000000000000000000000000..eb141ff93c2737904c31c4347ac180df6f234f56 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2594 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 7, + 4096 + ], + "D_DTYPE": 16 + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 4096 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 4096 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 4096 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 4096 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 4096 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 4096 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 4096 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4096 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 
4096 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4096 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4096 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4096 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 4096 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4096 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4096 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 4096 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 4096 + ] + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4096 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 4096 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 4096 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 4096 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 4096 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 4096 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 4096 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4096 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 4096 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 4096 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4096 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 43, + 4096 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 4096 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 4096 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 4096 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4096 + ], + "D_DTYPE": 16 + }, + "50": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4096 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4096 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4096 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 4096 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4096 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 4096 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4096 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4096 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 
1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4096 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4096 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + 
"NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4096 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4096 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4096 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4096 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4096 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 74, 
+ "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 4096 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4096 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4096 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4096 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4096 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4096 + ], + "D_DTYPE": 16 + }, + "103": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4096 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4096 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4096 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4096 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4096 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4096 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4096 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4096 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4096 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4096 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4096 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4096 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, 
+ "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4096 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4096 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4096 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4096 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4096 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4096 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4096 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4096 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4096 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4096 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4096 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4096 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 127, + 4096 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4096 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..0e20556e955358d9d692116994b108fa2f5f1f1d --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2629 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4096 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4096 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4096 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4096 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, 
+ "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4096 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4096 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4096 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4096 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4096 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4096 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 22, + 4096 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4096 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4096 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4096 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 30, + 4096 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 31, + 4096 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 32, + 4096 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 4096 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 4096 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 4096 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 4096 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 4096 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 4096 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 4096 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 4096 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 4096 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 4096 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 43, + 4096 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 44, + 4096 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 45, + 4096 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 46, + 4096 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 4096 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 4096 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 4096 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 4096 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4096 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4096 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 4096 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 4096 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4096 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 4096 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 4096 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 4096 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4096 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4096 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 4096 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 4096 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4096 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 4096 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4096 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4096 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4096 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 
4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 88, + 4096 + ] + }, + "89": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 89, + 4096 + ] + }, + "90": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 
0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 90, + 4096 + ] + }, + "91": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 91, + 4096 + ] + }, + "92": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 92, + 4096 + ] + }, + "93": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 93, + 4096 + ] + }, + "94": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 94, + 4096 + ] + }, + "95": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4096 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4096 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": 
{ + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4096 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4096 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4096 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4096 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4096 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4096 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4096 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4096 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4096 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4096 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4096 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4096 + ] + }, + 
"125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4096 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4096 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4096 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..bf02183e010ef07eee3a2b45b2fe565d72f6e3d2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2600 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4096 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4096 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4096 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4096 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4096 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4096 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4096 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4096 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4096 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4096 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 4096 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4096 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4096 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 25, + 4096 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 4096 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 4096 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 4096 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 4096 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 4096 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 4096 + ] + }, + "32": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 4096 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4096 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 4096 + ] + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 4096 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4096 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4096 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 4096 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 4096 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 4096 + ] + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 4096 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4096 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 4096 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 4096 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 4096 + ] + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 4096 + ] + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4096 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4096 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4096 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 4096 + ] + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4096 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 4096 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 4096 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 4096 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4096 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 4096 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4096 + ] + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 4096 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4096 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4096 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4096 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4096 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + 
"NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4096 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4096 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4096 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4096 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4096 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4096 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4096 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4096 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4096 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4096 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4096 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4096 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4096 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4096 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4096 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4096 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4096 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4096 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4096 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 72, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4096 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4096 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4096 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4096 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4096 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4096 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4096 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4096 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4096 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4096 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=256,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=256,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..32ae8a0f6f8ed4d500ed3588516103e12d538905 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=256,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2620 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 4096 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 4096 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 4096 + ], + "D_DTYPE": 16 + }, + "4": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 4096 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 4096 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 4096 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 4096 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 4096 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 4096 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 4096 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 4096 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 4096 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 4096 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 4096 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 4096 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 4096 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 4096 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 4096 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 4096 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 4096 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 4096 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, 
+ "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4096 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 4096 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4096 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4096 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4096 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 4096 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 4096 + ], + 
"D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 4096 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 4096 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 4096 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 4096 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 4096 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 4096 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4096 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4096 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4096 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 4096 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4096 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + 
"SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4096 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 4096 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 4096 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4096 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4096 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 4096 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4096 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 4096 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 65, + 4096 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4096 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4096 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4096 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 4096 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 88, + 4096 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4096 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4096 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4096 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, 
+ "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4096 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4096 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 
105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4096 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4096 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4096 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4096 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4096 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4096 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4096 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4096 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4096 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4096 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4096 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": 
[ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4096 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4096 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4096 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4096 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4096 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c8bcdec9c183378d9b3f32158aed00d7c7ae4c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2612 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 4096 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4096 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 4096 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 4096 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 4096 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 4096 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 4096 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 4096 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 4096 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 4096 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4096 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 4096 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4096 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 4096 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 4096 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4096 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 4096 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 4096 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 4096 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4096 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 4096 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 4096 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4096 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4096 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 4096 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4096 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 4096 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 4096 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4096 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 4096 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 4096 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 4096 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 4096 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4096 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4096 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4096 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 4096 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 4096 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4096 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4096 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 4096 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4096 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4096 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4096 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4096 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 4096 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 
76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 4096 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4096 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 4096 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 4096 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 4096 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 4096 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 4096 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 4096 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4096 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 4096 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4096 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, 
+ "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 4096 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4096 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4096 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 4096 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 4096 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4096 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4096 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 4096 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4096 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4096 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4096 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4096 + ] + }, + "107": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4096 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4096 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4096 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": 
[ + 112, + 4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4096 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4096 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4096 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4096 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + 
"DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4096 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4096 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4096 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4096 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4096 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4096 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4096 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4096 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4096 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=512,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=512,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..b2d9bbd3e2ca12677918e946cb068bcc29655176 --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=512,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2628 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 4096 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 4096 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 4096 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 4096 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 4096 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + 
"num_stages": 1, + "D_SHAPE": [ + 6, + 4096 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 4096 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 4096 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 4096 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 4096 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 4096 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 4096 + ], + "D_DTYPE": 
16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 4096 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 4096 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 4096 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 4096 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 17, + 4096 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 18, + 4096 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 19, + 4096 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 20, + 4096 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 4096 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4096 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 4096 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 24, + 4096 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4096 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 30, + 4096 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 4096 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 4096 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 4096 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 4096 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 4096 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 38, + 4096 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 4096 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 40, + 4096 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4096 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4096 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4096 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 4096 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4096 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4096 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 49, + 
4096 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 51, + 4096 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4096 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 4096 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 
4096 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 4096 + ] + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 4096 + ] + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 4096 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 4096 + ] + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 4096 + ] + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 4096 + ] + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 4096 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4096 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4096 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4096 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 4096 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4096 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4096 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4096 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4096 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4096 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4096 + ] + }, + "85": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4096 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4096 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4096 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4096 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4096 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + 
"DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4096 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4096 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4096 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4096 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4096 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 
240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4096 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4096 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4096 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4096 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4096 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4096 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4096 + ] + }, + 
"119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4096 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4096 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4096 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4096 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + 
"DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4096 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4096 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4096 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4096 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..80538525ab604b338a431bd0c682b8cefe353342 --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2591 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4096 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4096 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4096 + 
] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4096 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4096 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4096 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4096 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 4096 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 4096 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 4096 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 4096 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4096 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 4096 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 4096 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4096 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4096 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 4096 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 4096 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4096 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4096 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 4096 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 4096 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 4096 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 4096 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 4096 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 4096 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 4096 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 4096 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 4096 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4096 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4096 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4096 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4096 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 4096 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4096 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 4096 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4096 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4096 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4096 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4096 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4096 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 4096 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 4096 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4096 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 4096 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4096 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 4096 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 4096 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4096 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4096 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 4096 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 4096 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 4096 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4096 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 4096 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4096 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 4096 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4096 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4096 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 4096 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4096 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 4096 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4096 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4096 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4096 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4096 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4096 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4096 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4096 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4096 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4096 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4096 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4096 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4096 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4096 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4096 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4096 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4096 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4096 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4096 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4096 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4096 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4096 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4096 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4096 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 480, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4096 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4096 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..a61607b9c99ec358b70a026cfc17bfb50da05285 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2604 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, 
+ "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4096 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4096 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4096 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4096 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4096 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4096 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4096 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4096 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4096 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4096 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4096 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4096 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 4096 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4096 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4096 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 4096 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 4096 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 4096 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 4096 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 4096 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 4096 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 4096 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 4096 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 33, + 4096 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 
8, + 34, + 4096 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 4096 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 4096 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 4096 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 4096 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 4096 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 4096 + ] + }, + "41": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 4096 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 4096 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 43, + 4096 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 44, + 4096 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 45, + 4096 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 46, + 4096 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 4096 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 4096 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 4096 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 4096 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4096 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4096 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + 
"SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 4096 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 4096 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4096 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 4096 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 4096 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 4096 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, 
+ "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4096 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 60, + 4096 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4096 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 4096 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 4096 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 4096 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4096 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4096 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4096 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4096 + ] + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4096 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4096 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4096 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4096 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4096 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4096 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4096 + ] + }, + "76": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 76, + 4096 + ] + }, + "77": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": 
[ + 4, + 77, + 4096 + ] + }, + "78": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 78, + 4096 + ] + }, + "79": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 79, + 4096 + ] + }, + "80": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 80, + 4096 + ] + }, + "81": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 81, + 4096 + ] + }, + "82": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 82, + 4096 + ] + }, + "83": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 83, + 4096 + ] + 
}, + "84": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 84, + 4096 + ] + }, + "85": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 85, + 4096 + ] + }, + "86": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 86, + 4096 + ] + }, + "87": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 87, + 4096 + ] + }, + "88": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 88, + 4096 + ] + }, + "89": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 89, + 4096 + ] + }, + "90": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 90, + 4096 + ] + }, + "91": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 91, + 4096 + ] + }, + "92": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 92, + 4096 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4096 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4096 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4096 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4096 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4096 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4096 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4096 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4096 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4096 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4096 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4096 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4096 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4096 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4096 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4096 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4096 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4096 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4096 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4096 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4096 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4096 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4096 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4096 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4096 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4096 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4096 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4096 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4096 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4096 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4096 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4096 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4096 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4096 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4096 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4096 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4096 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..29bb0f413875a179aec29942d45686adf25bd768 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4096,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4096 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4096 + ] + }, + "3": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4096 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4096 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4096 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4096 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4096 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4096 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4096 + ] + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 4096 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 4096 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 4096 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 4096 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 4096 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 4096 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 4096 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 4096 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 4096 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 4096 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 4096 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 4096 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 4096 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 4096 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 4096 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 4096 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 4096 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4096 + ], + "D_DTYPE": 16 + }, + 
"28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4096 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4096 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4096 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4096 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4096 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 4096 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 4096 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 4096 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 4096 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 4096 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 4096 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4096 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, 
+ "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 4096 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 4096 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4096 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 4096 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 4096 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 4096 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4096 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4096 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 4096 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4096 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4096 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4096 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4096 + ], + 
"D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4096 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4096 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 4096 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4096 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4096 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4096 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 4096 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4096 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4096 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4096 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4096 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4096 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 4096 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 4096 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 4096 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 4096 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 4096 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 4096 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, 
+ "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 4096 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 4096 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4096 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 4096 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 4096 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 4096 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ 
+ 77, + 4096 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4096 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 4096 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 4096 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 4096 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 4096 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 4096 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 
0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 4096 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 4096 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4096 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 4096 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4096 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 4096 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4096 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4096 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 4096 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 4096 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4096 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4096 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 4096 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 4096 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4096 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4096 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4096 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4096 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4096 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4096 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4096 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4096 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4096 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4096 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 
4096 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4096 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4096 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4096 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4096 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4096 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4096 + ], + "D_DTYPE": 16 + }, + "115": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4096 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4096 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4096 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4096 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4096 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4096 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4096 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4096 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4096 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4096 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4096 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4096 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4096 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4096 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..f71f4f6548dfb79cf2132777af230724de1b7f99 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2620 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + 
] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 4608 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 4608 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 4608 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 4608 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 4608 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 4608 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4608 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 4608 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 4608 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 4608 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 4608 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 4608 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 
1, + "D_SHAPE": [ + 34, + 4608 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 4608 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 4608 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + 
}, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4608 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 4608 + ], + "D_DTYPE": 16 + }, + "52": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 52, + 4608 + ], + "D_DTYPE": 16 + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + 
"SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 53, + 4608 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 4608 + ], + "D_DTYPE": 16 + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 4608 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 4608 + ], + "D_DTYPE": 16 + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 4608 + ], + "D_DTYPE": 16 + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 4608 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4608 + ] + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 70, + 4608 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 73, + 4608 + ] + }, + "74": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 74, + 4608 + ] + }, + "75": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 76, + 4608 + ] + }, + "77": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 
77, + 4608 + ] + }, + "78": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 78, + 4608 + ] + }, + "79": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4608 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4608 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4608 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4608 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4608 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4608 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4608 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4608 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4608 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4608 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 100, + 4608 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4608 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4608 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 105, + 4608 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4608 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4608 + ] + }, + "109": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 109, + 4608 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4608 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 111, + 4608 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4608 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4608 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4608 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4608 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4608 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4608 + ] + }, + "118": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4608 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4608 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4608 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4608 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4608 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4608 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4608 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4608 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4608 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4608 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4608 + ] + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..a2a2635b36f1a6cf6576e1e5455b7bf2a10416ee --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2648 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4608 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 
32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4608 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4608 + ] + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 4608 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4608 + ] + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 4608 + ], + "D_DTYPE": 16 + }, + "18": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4608 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4608 + ] + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4608 + ] + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 4608 + ] + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4608 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 4608 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 4608 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4608 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4608 + ] + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 4608 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 4608 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 4608 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 4608 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 4608 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 4608 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 4608 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 4608 + 
] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 4608 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 4608 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 4608 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4608 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4608 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 4608 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 4608 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4608 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 4608 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 4608 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 4608 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4608 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 4608 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4608 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 4608 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 4608 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 4608 + ] + }, + "65": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4608 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4608 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4608 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4608 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4608 + ] + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 4608 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4608 + ] + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 4608 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 4608 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, 
+ "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4608 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4608 + ] + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4608 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 4608 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 
1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 4608 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 4608 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4608 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, 
+ "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4608 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4608 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4608 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4608 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4608 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4608 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4608 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4608 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4608 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4608 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4608 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4608 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4608 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4608 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4608 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4608 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4608 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4608 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4608 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4608 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4608 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + 
"NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4608 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4608 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4608 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4608 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4608 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4608 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 
1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4608 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4608 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4608 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4608 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4608 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4608 + ] + }, + "128": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4608 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..b708f8fed360aea50618041ee5b16ff317045848 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2618 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4608 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4608 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4608 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4608 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 16, + 4608 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4608 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4608 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 4608 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 4608 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 4608 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 4608 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 4608 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4608 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 51, + 4608 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4608 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, 
+ "D_SHAPE": [ + 53, + 4608 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 55, + 4608 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 57, + 4608 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 59, + 4608 + 
] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 61, + 4608 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 4608 + ], + "D_DTYPE": 16 + }, + "66": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 4608 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4608 + ] + }, + "71": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 73, + 4608 + ] + }, + "74": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 74, + 4608 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4608 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4608 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4608 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4608 + ] + }, + "82": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 86, + 4608 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 88, + 4608 + ] + }, + "89": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 90, + 4608 + ] + }, + "91": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 91, + 4608 + ] + }, + "92": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 92, + 4608 + ] + }, + "93": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 93, + 4608 + ] + }, + "94": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 95, + 4608 + ] + }, + "96": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 97, + 4608 + ] + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 98, + 4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 99, + 4608 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 100, + 4608 + ] + }, + "101": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 101, + 4608 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 102, + 4608 + ] + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 105, + 4608 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 107, + 4608 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, 
+ "D_SHAPE": [ + 2, + 108, + 4608 + ] + }, + "109": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 109, + 4608 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 110, + 4608 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 111, + 4608 + ] + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 112, + 4608 + ] + }, + "113": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 113, + 4608 + ] + }, + "114": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 114, + 4608 + ] 
+ }, + "115": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 115, + 4608 + ] + }, + "116": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 116, + 4608 + ] + }, + "117": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 117, + 4608 + ] + }, + "118": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 118, + 4608 + ] + }, + "119": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 119, + 4608 + ] + }, + "120": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 120, + 4608 + ] + }, + "121": { + 
"USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 121, + 4608 + ] + }, + "122": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 122, + 4608 + ] + }, + "123": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 123, + 4608 + ] + }, + "124": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 124, + 4608 + ] + }, + "125": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 125, + 4608 + ] + }, + "126": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 126, + 4608 + ] + }, + "127": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 127, + 4608 + ] + }, + "128": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 128, + 4608 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..85d93cb3dbc4a1169a1cd8749f52c72b54f4a7bd --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2597 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 4608 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4608 + ] + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 4608 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 4608 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 4608 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 4608 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 4608 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 4608 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 4608 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 4608 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 4608 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 4608 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 4608 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 4608 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 4608 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + }, + "41": { 
+ "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 4608 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4608 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4608 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4608 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 4608 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4608 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 4608 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4608 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4608 + ] + }, + 
"66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4608 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4608 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4608 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4608 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4608 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4608 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4608 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4608 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 77, + 4608 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4608 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4608 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4608 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4608 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4608 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4608 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4608 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4608 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4608 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4608 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4608 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4608 + ] + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4608 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4608 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4608 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4608 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4608 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4608 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4608 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4608 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4608 + ] + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4608 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4608 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4608 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4608 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4608 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4608 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4608 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4608 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4608 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4608 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4608 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4608 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4608 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4608 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4608 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4608 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..3841b09e5b2485687fcfa3a60f90b2b6ee656531 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2617 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4608 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4608 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4608 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4608 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4608 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4608 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4608 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 4608 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4608 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, 
+ 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 4608 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 4608 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 4608 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 4608 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4608 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4608 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4608 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 53, + 4608 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 55, + 4608 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, 
+ "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 57, + 4608 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 59, + 4608 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 61, + 4608 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 63, + 4608 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4608 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 4608 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + 
"69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4608 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4608 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4608 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4608 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4608 + ] + }, + "78": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 78, + 4608 + ] + }, + "79": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4608 + ] + }, + "82": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 86, + 4608 + ] + }, + "87": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 
2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 88, + 4608 + ] + }, + "89": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 90, + 4608 + ] + }, + "91": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 91, + 4608 + ] + }, + "92": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 92, + 4608 + ] + }, + "93": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 93, + 4608 + ] + }, + "94": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 95, + 4608 + ] + }, + "96": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 97, + 4608 + ] + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 98, + 4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 99, + 4608 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 100, + 4608 + ] + }, + "101": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 101, + 4608 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 102, + 4608 + ] + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 
32, + "D_SHAPE": [ + 2, + 105, + 4608 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 107, + 4608 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 108, + 4608 + ] + }, + "109": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 109, + 4608 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 110, + 4608 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 111, + 4608 
+ ] + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 112, + 4608 + ] + }, + "113": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 113, + 4608 + ] + }, + "114": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 114, + 4608 + ] + }, + "115": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 115, + 4608 + ] + }, + "116": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 116, + 4608 + ] + }, + "117": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 117, + 4608 + ] + }, + "118": { + 
"USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 118, + 4608 + ] + }, + "119": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 119, + 4608 + ] + }, + "120": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 120, + 4608 + ] + }, + "121": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 121, + 4608 + ] + }, + "122": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 122, + 4608 + ] + }, + "123": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 123, + 4608 + ] + }, + "124": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 124, + 4608 + ] + }, + "125": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 125, + 4608 + ] + }, + "126": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 126, + 4608 + ] + }, + "127": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 127, + 4608 + ] + }, + "128": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 128, + 4608 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..ef63b9c1d6d03bd65c973b9bf5b0fc7cbb67fef0 --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2625 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4608 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4608 + 
] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 4608 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 4608 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 4608 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4608 + ] + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4608 + ] + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 4608 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4608 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 4608 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4608 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4608 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 4608 + ] + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 4608 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 4608 + ] + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 4608 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 4608 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4608 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 4608 + ] + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4608 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4608 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 4608 + ] + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 4608 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 4608 + ] + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 4608 + ] + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 4608 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 4608 + ] + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4608 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4608 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 4608 + ] + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4608 + ] + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4608 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4608 + ] + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4608 + ] + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 4608 + ] + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4608 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4608 
+ ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 4608 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 4608 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4608 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4608 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + 
"DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4608 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4608 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4608 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4608 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 
1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4608 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4608 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4608 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4608 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + 
"NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4608 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4608 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4608 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4608 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4608 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 4608 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4608 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4608 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4608 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4608 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4608 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4608 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4608 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4608 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4608 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4608 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4608 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4608 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4608 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4608 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4608 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4608 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4608 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4608 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4608 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4608 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4608 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4608 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4608 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4608 + ] + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4608 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4608 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4608 + ] + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4608 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=256,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=256,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..d61e0f1a097d268bc925ee0761464609dfcc1321 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=256,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2563 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 4608 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 4608 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 4608 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 4608 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 4608 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 4608 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ 
+ 7, + 4608 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 4608 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 4608 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 4608 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 4608 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 4608 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 4608 + ], + "D_DTYPE": 16 + }, + "14": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 4608 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 4608 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 4608 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 4608 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 4608 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 4608 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 4608 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 4608 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 4608 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 4608 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4608 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + 
"51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 4608 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4608 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4608 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 4608 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 4608 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 4608 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 4608 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 4608 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 4608 + ] + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 4608 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 4608 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 4608 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4608 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 4608 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 4608 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 4608 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 4608 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4608 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 4608 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 4608 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 4608 + ], + 
"D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 4608 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 4608 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 4608 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 4608 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4608 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 4608 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4608 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 4608 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4608 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4608 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 4608 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 4608 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4608 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4608 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 4608 + ], + "D_DTYPE": 16 + }, + "97": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 97, + 4608 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4608 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4608 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4608 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4608 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4608 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4608 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4608 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4608 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4608 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4608 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4608 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4608 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4608 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4608 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 112, + 4608 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4608 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4608 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4608 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4608 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4608 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4608 + ], + "D_DTYPE": 16 + 
}, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4608 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4608 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4608 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4608 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4608 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4608 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4608 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4608 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4608 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4608 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..0836abae1ceb2dc45b3a4f3f83297f43f2e25f88 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2598 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 4608 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 4608 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 4608 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 4608 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 4608 + ], + "D_DTYPE": 16 + }, + "14": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 4608 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 4608 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 4608 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 4608 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 4608 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4608 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 4608 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 4608 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 4608 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 4608 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 4608 + ] + }, + "32": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 4608 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4608 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 4608 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 4608 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4608 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + 
"DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4608 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 4608 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 4608 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 4608 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 4608 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4608 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4608 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4608 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 4608 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": 
[ + 55, + 4608 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 4608 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 4608 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 4608 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4608 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 4608 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4608 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 4608 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 4608 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 4608 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 4608 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4608 + ], + 
"D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 4608 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 4608 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 4608 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 4608 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4608 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 4608 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 4608 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 4608 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 4608 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 4608 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 4608 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 4608 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4608 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 4608 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4608 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 4608 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4608 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4608 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 4608 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 4608 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4608 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4608 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 4608 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 4608 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4608 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4608 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4608 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4608 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4608 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4608 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4608 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4608 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4608 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 4608 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4608 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4608 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4608 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4608 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4608 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4608 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4608 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4608 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4608 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4608 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4608 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4608 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4608 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4608 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4608 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4608 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4608 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4608 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 560, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4608 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=512,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=512,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..a0311825dc4c58e8be82052e2163c8ac6e10a8f7 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=512,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2571 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 4608 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 4608 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 4608 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 4608 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 4608 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 4608 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 4608 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 4608 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 4608 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + 
"num_stages": 1, + "D_SHAPE": [ + 10, + 4608 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 4608 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 4608 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 4608 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 4608 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 4608 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 4608 + ], + 
"D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 4608 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 19, + 4608 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 20, + 4608 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 4608 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 4608 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 4608 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 37, + 4608 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 4608 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 4608 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4608 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4608 
+ ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 4608 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 4608 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 4608 + ], + "D_DTYPE": 16 + }, + "60": { + 
"USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 4608 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 4608 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 4608 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 4608 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 4608 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 4608 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4608 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4608 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 4608 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 4608 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4608 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 4608 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 4608 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 4608 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4608 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4608 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4608 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4608 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 4608 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 4608 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4608 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4608 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 4608 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 4608 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4608 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4608 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4608 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4608 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4608 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4608 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4608 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4608 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4608 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4608 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, 
+ "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4608 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4608 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4608 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4608 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4608 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4608 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": 
[ + 114, + 4608 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4608 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4608 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4608 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4608 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4608 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4608 + ], + "D_DTYPE": 16 + }, + 
"121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4608 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4608 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4608 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4608 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4608 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4608 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4608 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4608 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..1370f285378cc443c0d76f59deb96f5c0854e83d --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2608 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4608 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4608 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4608 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4608 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4608 + ] + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 4608 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 4608 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 4608 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4608 + ] + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 4608 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 4608 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 4608 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 4608 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 4608 + ] + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4608 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 4608 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 4608 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 4608 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 4608 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 4608 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 4608 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 4608 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 4608 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 4608 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 4608 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 4608 + ] + }, + "45": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 4608 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 4608 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 4608 + ] + }, + "51": { 
+ "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4608 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4608 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 4608 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 4608 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4608 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + 
"NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 4608 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4608 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 4608 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4608 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4608 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 4608 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 4608 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 4608 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 4608 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4608 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 240, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4608 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 4608 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 4608 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 4608 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4608 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 4608 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, 
+ "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 4608 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 4608 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 4608 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 4608 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 4608 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 4608 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4608 + ], + 
"D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 4608 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4608 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 4608 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4608 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4608 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4608 + ] + }, + "93": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 4608 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4608 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 28, + "NUM_CUS_STREAMK": 75, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 4608 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 
4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4608 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4608 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4608 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4608 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 74, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4608 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4608 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4608 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4608 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4608 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4608 + ], + "D_DTYPE": 16 
+ }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4608 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4608 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4608 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4608 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4608 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4608 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4608 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4608 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4608 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4608 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4608 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4608 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4608 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4608 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4608 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4608 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4608 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4608 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..994e8521723fd54685cc8e014213d93a9df4890e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2592 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 4608 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 
32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4608 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4608 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4608 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4608 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4608 + ] + }, + "18": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4608 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4608 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4608 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4608 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 4608 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4608 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4608 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 4608 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 4608 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 4608 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 28, + 4608 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 4608 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 30, + 4608 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 31, + 4608 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 32, + 4608 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 4608 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 4608 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 4608 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + 
"NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 4608 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 4608 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 4608 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 4608 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 4608 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 4608 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 
2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 4608 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 43, + 4608 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 44, + 4608 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 45, + 4608 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 46, + 4608 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 47, + 4608 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 48, + 4608 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 49, + 4608 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 50, + 4608 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 4608 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 4608 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 53, + 4608 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 54, + 4608 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 4608 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 56, + 4608 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 4608 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 58, + 4608 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 4608 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 4, + 60, + 4608 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 4608 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 62, + 4608 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 4608 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 64, + 4608 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 4608 + ] + }, + "66": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 4608 + ], + "D_DTYPE": 
16 + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 67, + 4608 + ] + }, + "68": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 4608 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 4608 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 4608 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 4608 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 4608 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 4608 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 4608 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 4608 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 4608 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 4608 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 4608 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4608 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 4608 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 4608 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 4608 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 4608 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 4608 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 4608 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 4608 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 4608 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 4608 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 4608 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 4608 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 4608 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 4608 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 4608 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 4608 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 4608 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 4608 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 4608 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 4608 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 4608 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 4608 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 4608 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 4608 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, 
+ "D_SHAPE": [ + 109, + 4608 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 4608 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 4608 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 4608 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 4608 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 4608 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 4608 + ] + }, 
+ "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 4608 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 4608 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 4608 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 4608 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 4608 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 4608 + ] + }, + "122": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 4608 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 4608 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 4608 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 4608 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 4608 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 4608 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 4608 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2010a72ad4993019edddaed550d3a311821724 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=4608,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 4608 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 4608 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 4608 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 4608 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 4608 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 4608 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 4608 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 4608 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 4608 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 
10, + 4608 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 4608 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 4608 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 4608 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 4608 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 4608 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 4608 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 4608 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 4608 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 4608 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 4608 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 4608 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 4608 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 4608 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 4608 + ] + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 4608 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 4608 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 4608 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 4608 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 4608 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 4608 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 4608 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 4608 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 4608 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 4608 + ] + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 4608 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 4608 + ] + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 4608 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 4608 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 4608 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 4608 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 4608 + ], + 
"D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 4608 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 4608 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 4608 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 4608 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 4608 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 4608 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 4608 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 4608 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 4608 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 4608 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 4608 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 4608 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 4608 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 4608 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 4608 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 4608 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 4608 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 4608 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 4608 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 4608 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 4608 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 4608 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 4608 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 4608 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 
66, + 4608 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 4608 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 4608 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 4608 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 4608 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 4608 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 4608 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, 
+ "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 4608 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 4608 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 4608 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 4608 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 4608 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 4608 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 4608 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 4608 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 4608 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 4608 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 4608 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 4608 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 4608 + ] + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 4608 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 4608 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 4608 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 4608 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 4608 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 4608 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 4608 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 4608 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 4608 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 4608 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 4608 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 4608 + ], + 
"D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 4608 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 4608 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 4608 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 4608 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 4608 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 4608 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 4608 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 4608 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 4608 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 4608 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 4608 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 4608 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 4608 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 4608 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 4608 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 4608 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 4608 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 4608 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 4608 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 4608 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 4608 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 4608 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 4608 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 4608 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 4608 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 4608 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 4608 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 4608 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 4608 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 4608 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 4608 + ], + 
"D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..c59bca48f523745098e4afb4aa12eb0059159080 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 512 + ] + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 512 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 512 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 512 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 512 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 512 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 512 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 512 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 512 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 512 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 512 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 512 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 512 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 512 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 512 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 512 + ] + }, + "17": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 17, + 512 + ] + }, + "18": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 512 + ] + }, + "19": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 512 + ] + }, + "20": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 512 + ] + }, + "21": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 512 + ] + }, + "22": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 512 + ] + }, + "23": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 512 + ] + }, + "24": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 512 + ] + }, + "25": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 512 + ] + }, + "26": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 512 + ] + }, + "27": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 512 + ] + }, + "28": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 28, + 512 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 512 + ] + }, + "30": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 
128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 512 + ] + }, + "31": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 512 + ] + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 32, + 512 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 512 + ] + }, + "34": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 512 + ] + }, + "35": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 512 + ] + }, + "36": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 512 + ] + }, + "37": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 512 + ] + }, + "38": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 512 + ] + }, + "39": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 512 + ] + }, + "40": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 512 + ] + }, + "41": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 512 + ] + }, + "42": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 512 + ] + }, + "43": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 512 + ] + }, + "44": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 512 + ] + }, + "45": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 512 + ] + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 512 + ] + }, + "47": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 512 + ] + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 512 + ] + }, + "49": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 512 + ] + }, + "50": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 512 + ] + }, + "51": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 512 + ] + }, + "52": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 512 + ] + }, + "53": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 512 + ] + }, + "54": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 512 
+ ] + }, + "55": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 512 + ] + }, + "56": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 512 + ] + }, + "57": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 512 + ] + }, + "58": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 512 + ] + }, + "59": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 512 + ] + }, + "60": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 512 + ] + }, + "61": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 512 + ] + }, + "62": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 512 + ] + }, + "63": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 512 + ] + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 512 + ] + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 512 + ] + }, + "66": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 512 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 512 + ] + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 512 + ] + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 512 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 512 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 512 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 512 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 512 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 512 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 512 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 512 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 512 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 512 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 512 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 512 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 512 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 512 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 512 + ] + }, + "84": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 512 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 
512 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 512 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 512 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 512 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 512 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 512 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 512 + ] + }, + "92": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, 
+ "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 512 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 512 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 512 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 512 + ] + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 512 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 512 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 512 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 512 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 512 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 512 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 512 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 512 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 512 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 512 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 512 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 512 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 512 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 512 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 512 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 512 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 512 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 512 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 512 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 512 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, 
+ "D_SHAPE": [ + 116, + 512 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 512 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 512 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 512 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 512 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 512 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 512 + ] + }, + "123": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 512 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 512 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 512 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 512 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 512 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 512 + ] + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..8b502216c6261a3f18d7eeb92cc9f90f9ccf794f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 512 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 512 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 512 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 512 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 512 + ] + }, + "6": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 6, + 512 + ], + "D_DTYPE": 16 + }, + "7": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 7, + 512 + ], + "D_DTYPE": 16 + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 512 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 512 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 512 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 512 + ], + "D_DTYPE": 16 + }, + 
"12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 512 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 512 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 512 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 512 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 512 + ] + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 512 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 512 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 512 + ] + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 512 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 512 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 512 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 512 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 512 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 512 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 512 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 512 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 512 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 512 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 512 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 512 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 512 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 512 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 512 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 512 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 512 + ], + "D_DTYPE": 16 + }, + "37": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 512 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 512 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 512 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 512 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 512 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 512 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 512 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 512 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 512 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 512 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 512 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 512 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 512 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 512 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 512 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 512 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 512 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 512 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": 
[ + 55, + 512 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 512 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 512 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 512 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 512 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 512 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 512 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 512 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 512 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 512 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 512 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 512 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 512 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 512 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 512 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 512 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 512 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 512 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 512 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 512 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 512 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 512 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 512 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 512 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 512 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 512 + ], + "D_DTYPE": 16 + }, 
+ "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 512 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 512 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 512 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 512 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 512 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 512 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 512 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 512 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 512 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 512 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 512 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 512 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 512 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 512 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 512 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 512 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 512 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 512 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "D_SHAPE": [ + 99, + 512 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 512 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 512 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 512 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 512 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 512 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 512 + ], + "D_DTYPE": 16 + }, + "106": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 512 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 512 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 512 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 512 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 512 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 512 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 512 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 512 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 512 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 512 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 512 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 512 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 512 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 512 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 512 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 512 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 512 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 512 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 512 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 512 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 512 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 512 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 512 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..59adbaef7410f370350c96d91c604794968195c3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2575 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 512 + ] + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 512 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 512 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 512 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 512 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 512 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 512 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 512 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 512 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 512 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 512 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 512 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 512 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 512 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 512 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 512 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 512 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 512 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 19, + 512 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 512 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 512 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 512 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 512 + ] + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 512 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 512 + ] + }, + "26": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 512 + ] + }, + "27": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 512 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 512 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 512 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 512 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 512 + ] + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 512 + ] + }, + "33": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 512 + ] + }, + "34": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 512 + ] + }, + "35": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 512 + ] + }, + "36": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 512 + ] + }, + "37": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 512 + ] + }, + "38": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 512 + ] + }, + "39": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 512 + ] + }, + "40": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 512 + ] + }, + "41": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 512 + ] + }, + "42": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 512 + ] + }, + "43": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 512 + ] + }, + "44": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 512 + ] + }, + "45": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 512 + ] + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 512 + ] + }, + "47": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 512 + ] + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 512 + ] + }, + "49": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 512 + ] + }, + "50": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 512 + ] + }, + "51": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 512 + ] + }, + "52": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 512 + ] + }, + "53": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 512 + ] + }, + "54": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 512 + ] + }, + "55": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 512 + ] + }, + "56": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 512 + ] 
+ }, + "57": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 512 + ] + }, + "58": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 512 + ] + }, + "59": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 512 + ] + }, + "60": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 512 + ] + }, + "61": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 512 + ] + }, + "62": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 512 + ] + }, + "63": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 512 + ] + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 512 + ] + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 512 + ] + }, + "66": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 512 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 512 + ] + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 512 + ] + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 512 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 512 + ] + }, + "71": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 512 + ] + }, + "72": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 512 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 512 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 512 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 512 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 512 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 512 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 512 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 512 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 512 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 512 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 512 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 512 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 512 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 512 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 512 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 
512 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 512 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 512 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 512 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 512 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 512 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 512 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 512 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 512 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 512 + ] + }, + "97": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 512 + ] + }, + "98": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 98, + 512 + ] + }, + "99": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 512 + ] + }, + "100": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 100, + 512 + ] + }, + "101": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 512 + ] + }, + "102": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 102, + 512 + ] + }, + "103": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 512 + ] + }, + "104": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 104, + 512 + ] + }, + "105": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 512 + ] + }, + "106": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, 
+ "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 106, + 512 + ] + }, + "107": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 512 + ] + }, + "108": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 108, + 512 + ] + }, + "109": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 512 + ] + }, + "110": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 110, + 512 + ] + }, + "111": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 512 + ] + }, + "112": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 112, + 512 + ] + }, + "113": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 512 + ] + }, + "114": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 114, + 512 + ] + }, + "115": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 512 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 512 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 512 + ] + }, + "118": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 118, + 512 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 512 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 512 + ] + }, + "121": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 512 + ] + }, + "122": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 122, + 512 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 512 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 124, + 512 + ] + }, + "125": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 512 + ] + }, + "126": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 126, + 512 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 512 + ] + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 128, + 512 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..f2406d774acc2cd18a9437af39be56e08af292d1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 512 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 512 + ] + }, + "3": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 3, + 512 + ], + "D_DTYPE": 16 + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 512 + ] + }, + "5": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 5, + 512 + ], + "D_DTYPE": 16 + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 512 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 512 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 512 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 512 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 512 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 512 + ] + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 512 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 512 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 512 + ] + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 512 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 512 + ] + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 512 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 512 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 512 + ] + }, + 
"20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 512 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 512 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 512 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 512 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 512 + ] + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 512 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 512 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 512 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 512 + ] + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 512 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 512 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 512 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, 
+ "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 512 + ] + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 512 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 512 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 512 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 512 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 512 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 512 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 512 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 512 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 512 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 512 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 512 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 512 + ], + "D_DTYPE": 16 + }, + "45": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 512 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 512 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 512 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 512 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 512 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 512 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 512 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 512 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 512 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 512 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 512 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 512 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 512 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 512 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 512 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 512 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 512 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 512 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": 
[ + 63, + 512 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 512 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 512 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 512 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 512 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 512 + ] + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 512 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 512 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 512 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 512 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 512 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 512 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 512 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 512 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 512 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 512 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 512 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 512 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 512 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 512 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 512 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 512 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 512 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 512 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 512 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 512 + ], + 
"D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 512 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 512 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 512 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 512 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 512 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 512 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 512 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 512 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 512 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 512 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 512 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 512 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 512 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 512 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 512 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 512 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 512 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 512 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 512 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 512 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 512 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 512 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 512 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 512 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 512 + ], + 
"D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 512 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 512 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 512 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 512 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 512 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 512 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 512 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 512 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 512 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 512 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 512 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 512 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 512 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 512 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 512 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..b66a62a4d24613c3f7cc2849f82c61639a51abb1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2571 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 512 + ] + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 512 + ] + }, + 
"3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 512 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 512 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 512 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 512 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 512 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 512 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, 
+ "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 512 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 512 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 512 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 512 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 512 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 512 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 512 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 512 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 512 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 512 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 512 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 512 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 512 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 512 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 512 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 512 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 512 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 512 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 512 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 512 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 512 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 512 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 512 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 512 + ] + }, + "33": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 512 
+ ] + }, + "34": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 512 + ] + }, + "35": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 512 + ] + }, + "36": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 512 + ] + }, + "37": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 512 + ] + }, + "38": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 512 + ] + }, + "39": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 512 + ] + }, + "40": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 512 + ] + }, + "41": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 512 + ] + }, + "42": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 512 + ] + }, + "43": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 512 + ] + }, + "44": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 512 + ] + }, + "45": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 512 + ] + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 512 + ] + }, + "47": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 512 + ] + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 512 + ] + }, + "49": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 512 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 50, + 512 + ] + }, + "51": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 512 + ] + }, + "52": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 52, + 512 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 512 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 54, + 512 + ] + }, + "55": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 512 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 56, + 512 + ] + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 512 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 58, + 512 + ] + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 59, + 512 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 60, + 512 + ] + }, + "61": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 512 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 62, + 512 + ] + }, + "63": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 512 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 64, + 512 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 512 + ] + }, + "66": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 512 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 512 + ] + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 512 + ] + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 512 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 70, + 512 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 512 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 512 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 512 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 512 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 512 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 512 + ] + }, + "77": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 512 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 512 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 512 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 512 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 512 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 512 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 512 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 512 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 512 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 512 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 512 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 512 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 512 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 512 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 512 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 512 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 512 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 512 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 512 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 512 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 512 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 512 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 512 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 512 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 101, + 512 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 512 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 512 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 512 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 512 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 512 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 512 + ] + }, + "108": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 512 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 512 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 512 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 512 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 512 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 512 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 512 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 512 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 512 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 512 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 512 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 512 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 512 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 512 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 512 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 512 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 512 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 512 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 512 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 512 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 512 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..ee88d49902ccd327973ca7ebd0559419b8e3639f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=512,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 1, + 512 + ], + "D_DTYPE": 16 + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 512 + ] + }, + "3": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 512 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 512 + ] + }, + "5": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 5, + 512 + ], + "D_DTYPE": 16 + }, + "6": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 6, + 512 + ], + "D_DTYPE": 16 + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 512 + ] + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 512 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 512 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 512 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 512 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 512 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 512 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 512 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 512 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 512 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 512 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 512 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 512 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 512 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": 
[ + 21, + 512 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 512 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 512 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 512 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 512 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 512 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 512 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 
80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 512 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 512 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 512 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 512 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 512 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 512 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 512 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 512 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 512 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 512 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 512 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 512 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 512 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 512 + ] + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 512 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 512 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 512 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 512 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 512 + ], + "D_DTYPE": 16 + 
}, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 512 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 512 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 512 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 512 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 512 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 512 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 512 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 512 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 512 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 512 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 512 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 512 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 512 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 512 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 512 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 512 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 512 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 512 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "D_SHAPE": [ + 65, + 512 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 512 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 512 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 512 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 512 + ] + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 512 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 512 + ] + }, + "72": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 512 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 512 + ] + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 512 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 512 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 512 + ] + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 512 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 512 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 512 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 512 + ] + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 512 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 512 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 512 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, 
+ "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 512 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 512 + ] + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 512 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 512 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 512 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 512 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 512 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 512 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 512 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 512 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 512 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 512 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 512 + ], + "D_DTYPE": 16 + }, + "97": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 512 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 512 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 512 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 512 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 512 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 512 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 512 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 512 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 512 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 512 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 512 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 512 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 512 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 512 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 512 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 512 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 512 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 512 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 512 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 512 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 512 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 512 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 512 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 512 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 512 + ], + "D_DTYPE": 16 + }, + 
"122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 512 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 512 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 512 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 512 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 512 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 512 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 512 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..fa1d14afd71db3da1546519a05485ba28f98e505 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2583 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 576 + ] + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 576 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 576 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "D_SHAPE": [ + 4, + 576 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 5, + 576 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 6, + 576 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 7, + 576 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 8, + 576 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 9, + 576 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 10, + 576 + ], + "D_DTYPE": 16 + }, + 
"11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 11, + 576 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 12, + 576 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 13, + 576 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 14, + 576 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 15, + 576 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 16, + 576 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 17, + 576 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 18, + 576 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 19, + 576 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 20, + 576 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 21, + 576 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 22, + 576 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 23, + 576 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 24, + 576 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 25, + 576 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 26, + 576 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 27, + 576 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 28, + 576 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 
2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 29, + 576 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 30, + 576 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 31, + 576 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 32, + 576 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 576 + ] + }, + "34": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 576 + ] + }, + "35": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 576 + ] + }, + "36": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 576 + ] + }, + "37": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 576 + ] + }, + "38": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 576 + ] + }, + "39": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 576 + ] + }, + "40": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 576 + ] + }, + "41": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 41, + 576 + ] + }, + "42": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 576 + ] + }, + "43": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 576 + ] + }, + "44": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 576 + ] + }, + "45": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 576 + ] + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 576 + ] + }, + "47": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 576 + ] + }, + "48": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 576 + ] + }, + "49": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 49, + 576 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 50, + 576 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 51, + 576 + ], + "D_DTYPE": 16 + }, + "52": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 576 + ] + }, + "53": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 576 + ] + }, + "54": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 54, + 576 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 55, + 576 + ], + "D_DTYPE": 16 + }, + "56": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 576 + ] + }, + "57": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 57, + 576 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 576 + ] + }, + "59": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 59, + 576 + ], + "D_DTYPE": 16 + }, + "60": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 576 + ] + }, + "61": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 61, + 576 + ], + "D_DTYPE": 16 + }, + "62": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 576 + ] + }, + "63": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 63, + 576 + ], + "D_DTYPE": 16 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 576 + ] + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 576 + ] + }, + "66": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 576 + ] + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 576 + ] + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 576 + ] + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 576 + ] + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 576 + ] + }, + "71": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 576 + ] + }, + "72": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 576 + ] + }, + "73": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 576 + ] + }, + "74": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 576 + ] + }, + "75": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 576 + ] + }, + "76": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 576 + ] + }, + "77": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 576 + ] + }, + "78": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 576 
+ ] + }, + "79": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 576 + ] + }, + "80": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 576 + ] + }, + "81": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 576 + ] + }, + "82": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 576 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 83, + 576 + ] + }, + "84": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 576 + ] + }, + "85": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, 
+ "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 85, + 576 + ] + }, + "86": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 576 + ] + }, + "87": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 87, + 576 + ] + }, + "88": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 576 + ] + }, + "89": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 576 + ] + }, + "90": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 90, + 576 + ] + }, + "91": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 576 + ] + }, + "92": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 92, + 576 + ] + }, + "93": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 576 + ] + }, + "94": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 94, + 576 + ] + }, + "95": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 576 + ] + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 576 + ] + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 97, + 576 + ] + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 98, + 576 + ] + }, + "99": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 99, + 576 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 100, + 576 + ] + }, + "101": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 101, + 576 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 102, + 576 + ] + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 103, + 576 + ] + }, + "104": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 576 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 105, + 576 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 106, + 576 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 107, + 576 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 108, + 576 + ] + }, + "109": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 109, + 576 + ] + }, + "110": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 576 + ] + }, + "111": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 576 + ] + }, + "112": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 112, + 576 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 113, + 576 + ] + }, + "114": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 576 + ] + }, + "115": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 115, + 576 + ] + 
}, + "116": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 116, + 576 + ] + }, + "117": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 576 + ] + }, + "118": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 576 + ] + }, + "119": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 576 + ] + }, + "120": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 576 + ] + }, + "121": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 121, + 576 + ] + }, + "122": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 576 + ] + }, + "123": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 576 + ] + }, + "124": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 576 + ] + }, + "125": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 576 + ] + }, + "126": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 576 + ] + }, + "127": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 576 + ] + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 128, + 576 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..a456cdcbbe0a2da8297a50fa4ab7bf29d181ee48 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 576 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 576 + ] + }, + "3": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 3, + 576 + ], + "D_DTYPE": 16 + }, + "4": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "D_SHAPE": [ + 4, + 576 + ], + "D_DTYPE": 16 + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 576 + ] + }, + "6": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 6, + 576 + ], + "D_DTYPE": 16 + }, + "7": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 7, + 576 + ], + "D_DTYPE": 16 + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 576 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 576 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 576 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 576 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 576 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 576 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 576 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 576 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 576 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 576 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 576 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 576 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 576 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 576 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 576 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 576 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 576 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 576 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 576 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 576 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 576 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 576 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 576 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 576 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 576 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 576 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 576 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 576 + 
] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 576 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 576 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 576 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 576 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 576 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 576 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 576 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 576 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 576 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 576 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 576 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 576 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 576 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 576 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 576 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 576 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 576 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 576 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 576 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 576 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 576 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 576 + ] + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 576 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 576 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 576 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 576 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 576 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 576 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 576 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 576 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 576 + 
] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 576 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 576 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 576 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 576 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 576 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 576 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 576 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 576 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 576 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 576 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 576 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 576 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 576 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 576 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 576 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 576 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 576 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 576 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 576 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 576 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 576 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 576 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 576 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 576 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 576 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 576 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 576 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 576 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 576 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 576 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 576 + 
] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 576 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 576 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 576 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 576 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 576 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 576 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 576 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 576 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 576 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 576 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 576 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 576 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, 
+ "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 576 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 576 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 576 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 576 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 576 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 576 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 
4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 576 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 576 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 576 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 576 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 576 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 576 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 576 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 576 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 576 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 576 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 576 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 576 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 576 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=256,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=256,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..70c9509c6a1bd7958d87df17ec9310035ec681e0 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=256,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 576 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 576 + ], + "D_DTYPE": 16 + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 576 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 576 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 576 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 576 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 576 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 576 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 576 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 10, + 576 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 576 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 576 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 576 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 576 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 576 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 576 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 17, + 576 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 18, + 576 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 576 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 576 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 576 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 576 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 576 
+ ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 576 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 576 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 576 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 576 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 576 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 576 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 576 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 576 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 576 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 576 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 576 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 576 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, 
+ "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 576 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 576 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 576 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 576 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 576 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 576 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 576 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 576 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 576 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 576 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 576 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 576 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 576 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 576 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 576 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 576 + ], + "D_DTYPE": 16 + }, + "52": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 52, + 576 + ], + "D_DTYPE": 16 + }, + "53": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 53, + 576 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 
54, + 576 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 576 + ], + "D_DTYPE": 16 + }, + "56": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 56, + 576 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 576 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 58, + 576 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 576 + ], + "D_DTYPE": 16 + }, + "60": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 60, + 576 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 576 + ], + "D_DTYPE": 16 + }, + "62": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 62, + 576 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 576 + ], + "D_DTYPE": 16 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 64, + 576 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 576 + ], + "D_DTYPE": 16 + }, + "66": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 576 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + 
"SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 576 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 576 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 576 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 576 + ], + "D_DTYPE": 16 + }, + "71": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 71, + 576 + ], + "D_DTYPE": 16 + }, + "72": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 72, + 576 + ], + "D_DTYPE": 16 + }, + "73": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 73, + 576 + ], + "D_DTYPE": 16 + }, + "74": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 74, + 576 + ], + "D_DTYPE": 16 + }, + "75": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 75, + 576 + ], + "D_DTYPE": 16 + }, + "76": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 76, + 576 + ], + "D_DTYPE": 16 + }, + "77": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 77, + 576 + ], + "D_DTYPE": 16 + }, + "78": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 78, + 576 + ], + "D_DTYPE": 16 + }, + "79": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 79, + 576 + ], + "D_DTYPE": 16 + }, + "80": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 80, + 576 + ], + "D_DTYPE": 16 + }, + "81": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 81, + 576 + ], + "D_DTYPE": 16 + }, + "82": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 82, + 576 + ], + "D_DTYPE": 16 + }, + "83": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 83, + 576 + ], + "D_DTYPE": 16 + }, + "84": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 84, + 576 + ], + "D_DTYPE": 16 + }, + "85": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 85, + 576 + ], + "D_DTYPE": 16 + }, + "86": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 86, + 576 + ], + "D_DTYPE": 16 + }, + "87": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 87, + 576 + ], + "D_DTYPE": 16 + }, + "88": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 88, + 576 + ], + "D_DTYPE": 16 + }, + "89": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 89, + 576 + ], + "D_DTYPE": 16 + }, + "90": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 90, + 576 + ], + "D_DTYPE": 16 + }, + "91": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 91, + 576 + ], + "D_DTYPE": 16 + }, + "92": { 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 92, + 576 + ], + "D_DTYPE": 16 + }, + "93": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 93, + 576 + ], + "D_DTYPE": 16 + }, + "94": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 94, + 576 + ], + "D_DTYPE": 16 + }, + "95": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 95, + 576 + ], + "D_DTYPE": 16 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 96, + 576 + ], + "D_DTYPE": 16 + }, + "97": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 97, + 576 + ], + "D_DTYPE": 16 + }, + "98": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 
128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 98, + 576 + ], + "D_DTYPE": 16 + }, + "99": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 99, + 576 + ], + "D_DTYPE": 16 + }, + "100": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 100, + 576 + ], + "D_DTYPE": 16 + }, + "101": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 101, + 576 + ], + "D_DTYPE": 16 + }, + "102": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 102, + 576 + ], + "D_DTYPE": 16 + }, + "103": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 103, + 576 + ], + "D_DTYPE": 16 + }, + "104": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 104, + 576 + ], + "D_DTYPE": 16 + }, + "105": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 105, + 576 + ], + "D_DTYPE": 16 + }, + "106": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 106, + 576 + ], + "D_DTYPE": 16 + }, + "107": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 107, + 576 + ], + "D_DTYPE": 16 + }, + "108": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 108, + 576 + ], + "D_DTYPE": 16 + }, + "109": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 109, + 576 + ], + "D_DTYPE": 16 + }, + "110": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 110, + 576 + ], + "D_DTYPE": 16 + }, + "111": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 111, + 576 + ], + "D_DTYPE": 16 + }, + "112": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 112, + 576 + ], + "D_DTYPE": 16 + }, + "113": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 113, + 576 + ], + "D_DTYPE": 16 + }, + "114": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 114, + 576 + ], + "D_DTYPE": 16 + }, + "115": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 115, + 576 + ], + "D_DTYPE": 16 + }, + "116": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 116, + 576 + ], + "D_DTYPE": 16 + }, + "117": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 117, + 576 + ], + "D_DTYPE": 16 + }, + "118": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 118, + 576 + ], + "D_DTYPE": 16 + }, + "119": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 119, + 576 + ], + "D_DTYPE": 16 + }, + "120": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 120, + 576 + ], + "D_DTYPE": 16 + }, + "121": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 121, + 576 + ], + "D_DTYPE": 16 + }, + "122": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "D_SHAPE": [ + 122, + 576 + ], + "D_DTYPE": 16 + }, + "123": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 123, + 576 + ], + "D_DTYPE": 16 + }, + "124": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 124, + 576 + ], + "D_DTYPE": 16 + }, + "125": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 125, + 576 + ], + "D_DTYPE": 16 + }, + "126": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 126, + 576 + ], + "D_DTYPE": 16 + }, + "127": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 127, + 576 + ], + "D_DTYPE": 16 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 128, + 576 + ], + "D_DTYPE": 16 + } 
+} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..8820e3836737a4465c5b6d0b7adcaaecb3187a50 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 576 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 576 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 576 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 576 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + 
"num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 576 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 576 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 576 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 576 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 576 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 576 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, 
+ "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 576 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 576 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 576 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 576 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 576 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 576 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, 
+ 576 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 576 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 576 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 576 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 576 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 576 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 576 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 576 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 576 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 576 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 576 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 576 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 576 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 576 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 576 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 576 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 576 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 576 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 576 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 576 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 576 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 576 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 576 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 576 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 576 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 576 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 576 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 576 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 576 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 576 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 576 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 
576 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 576 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 576 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 576 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 576 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 576 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 576 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 576 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 576 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 576 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 576 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 576 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 576 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 576 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 576 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 576 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 576 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 576 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 576 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 576 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 576 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 576 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 576 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 576 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 576 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 576 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 576 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 576 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 576 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 576 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 576 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 
576 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 576 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 576 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 576 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 576 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 576 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 576 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 576 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 576 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 576 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 576 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 576 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 576 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 576 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 576 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 576 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 576 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 576 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 576 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 576 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 576 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 576 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 576 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 576 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 576 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 576 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 576 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 576 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 576 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 576 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 576 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "D_SHAPE": [ + 110, + 576 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 576 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 576 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 576 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 576 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 576 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 576 + ], + "D_DTYPE": 16 + 
}, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 576 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 576 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 576 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 576 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 576 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 576 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 576 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 576 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 576 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 576 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 576 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 576 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=512,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=512,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..d5159192873e271feb86eb77f1afce3bf87b3af4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=512,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 1, + 576 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 2, + 576 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 3, + 576 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 4, + 576 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 5, + 576 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 6, + 576 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 7, + 576 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 8, + 576 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 9, + 576 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 10, + 576 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 11, + 576 + ], + 
"D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 12, + 576 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 13, + 576 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 14, + 576 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 15, + 576 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 16, + 576 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 17, + 576 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 18, + 576 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 19, + 576 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 20, + 576 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 21, + 576 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 22, + 576 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 23, + 576 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, 
+ "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 24, + 576 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 25, + 576 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 26, + 576 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 27, + 576 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 28, + 576 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 29, + 576 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 30, + 576 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 31, + 576 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 32, + 576 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 33, + 576 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 34, + 576 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 35, + 576 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 36, + 576 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 37, + 576 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 38, + 576 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 39, + 576 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 40, + 576 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 41, + 576 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"D_SHAPE": [ + 42, + 576 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 43, + 576 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 44, + 576 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 45, + 576 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 46, + 576 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 47, + 576 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 48, + 576 + ], + "D_DTYPE": 16 + }, + "49": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 49, + 576 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 50, + 576 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 51, + 576 + ], + "D_DTYPE": 16 + }, + "52": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 52, + 576 + ], + "D_DTYPE": 16 + }, + "53": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 53, + 576 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 54, + 576 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 
128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 55, + 576 + ], + "D_DTYPE": 16 + }, + "56": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 56, + 576 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 57, + 576 + ], + "D_DTYPE": 16 + }, + "58": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 58, + 576 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 59, + 576 + ], + "D_DTYPE": 16 + }, + "60": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 60, + 576 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 61, + 576 + ], + "D_DTYPE": 16 + }, + "62": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 62, + 576 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 63, + 576 + ], + "D_DTYPE": 16 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 64, + 576 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 576 + ], + "D_DTYPE": 16 + }, + "66": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 576 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 576 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 576 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 576 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 576 + ], + "D_DTYPE": 16 + }, + "71": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 71, + 576 + ], + "D_DTYPE": 16 + }, + "72": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 72, + 576 + ], + "D_DTYPE": 16 + }, + "73": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_SHAPE": [ + 73, + 576 + ], + "D_DTYPE": 16 + }, + "74": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 74, + 576 + ], + "D_DTYPE": 16 + }, + "75": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 75, + 576 + ], + "D_DTYPE": 16 + }, + "76": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 76, + 576 + ], + "D_DTYPE": 16 + }, + "77": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 77, + 576 + ], + "D_DTYPE": 16 + }, + "78": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 78, + 576 + ], + "D_DTYPE": 16 + }, + "79": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 79, + 576 + ], + "D_DTYPE": 
16 + }, + "80": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 80, + 576 + ], + "D_DTYPE": 16 + }, + "81": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 81, + 576 + ], + "D_DTYPE": 16 + }, + "82": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 82, + 576 + ], + "D_DTYPE": 16 + }, + "83": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 83, + 576 + ], + "D_DTYPE": 16 + }, + "84": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 84, + 576 + ], + "D_DTYPE": 16 + }, + "85": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 85, + 576 + ], + "D_DTYPE": 16 + }, + "86": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, 
+ "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 86, + 576 + ], + "D_DTYPE": 16 + }, + "87": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 87, + 576 + ], + "D_DTYPE": 16 + }, + "88": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 88, + 576 + ], + "D_DTYPE": 16 + }, + "89": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 89, + 576 + ], + "D_DTYPE": 16 + }, + "90": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 90, + 576 + ], + "D_DTYPE": 16 + }, + "91": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 91, + 576 + ], + "D_DTYPE": 16 + }, + "92": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 92, + 576 + ], + "D_DTYPE": 16 + }, + "93": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 93, + 576 + ], + "D_DTYPE": 16 + }, + "94": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 94, + 576 + ], + "D_DTYPE": 16 + }, + "95": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 95, + 576 + ], + "D_DTYPE": 16 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 96, + 576 + ], + "D_DTYPE": 16 + }, + "97": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 97, + 576 + ], + "D_DTYPE": 16 + }, + "98": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 
2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 98, + 576 + ], + "D_DTYPE": 16 + }, + "99": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 99, + 576 + ], + "D_DTYPE": 16 + }, + "100": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 100, + 576 + ], + "D_DTYPE": 16 + }, + "101": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 101, + 576 + ], + "D_DTYPE": 16 + }, + "102": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 102, + 576 + ], + "D_DTYPE": 16 + }, + "103": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 103, + 576 + ], + "D_DTYPE": 16 + }, + "104": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 104, + 576 + ], + "D_DTYPE": 16 + }, + "105": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 105, + 576 + ], + "D_DTYPE": 16 + }, + "106": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 106, + 576 + ], + "D_DTYPE": 16 + }, + "107": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 107, + 576 + ], + "D_DTYPE": 16 + }, + "108": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 108, + 576 + ], + "D_DTYPE": 16 + }, + "109": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 109, + 576 + ], + "D_DTYPE": 16 + }, + "110": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "D_SHAPE": [ + 110, + 576 + ], + "D_DTYPE": 16 + }, + "111": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 111, + 576 + ], + "D_DTYPE": 16 + }, + "112": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 112, + 576 + ], + "D_DTYPE": 16 + }, + "113": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 113, + 576 + ], + "D_DTYPE": 16 + }, + "114": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 114, + 576 + ], + "D_DTYPE": 16 + }, + "115": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 115, + 576 + ], + "D_DTYPE": 16 + }, + "116": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 116, + 576 + ], + "D_DTYPE": 
16 + }, + "117": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 117, + 576 + ], + "D_DTYPE": 16 + }, + "118": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 118, + 576 + ], + "D_DTYPE": 16 + }, + "119": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 119, + 576 + ], + "D_DTYPE": 16 + }, + "120": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 120, + 576 + ], + "D_DTYPE": 16 + }, + "121": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 121, + 576 + ], + "D_DTYPE": 16 + }, + "122": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 122, + 576 + ], + "D_DTYPE": 16 + }, + "123": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 123, + 576 + ], + "D_DTYPE": 16 + }, + "124": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 124, + 576 + ], + "D_DTYPE": 16 + }, + "125": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 125, + 576 + ], + "D_DTYPE": 16 + }, + "126": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 126, + 576 + ], + "D_DTYPE": 16 + }, + "127": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 127, + 576 + ], + "D_DTYPE": 16 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 128, + 576 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..61145813e8b6c9f03aa857feb51b3ae954ed651f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 576 + ] + }, + "2": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 2, + 576 + ], + "D_DTYPE": 16 + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 576 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 576 + ] + }, + "5": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 5, + 576 + ], + "D_DTYPE": 16 + }, + "6": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 6, + 576 + ], + "D_DTYPE": 16 + }, + "7": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 7, + 576 + ], + "D_DTYPE": 16 + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 576 + ] + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 576 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 576 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 576 + ] + }, + 
"12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 576 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 576 + ] + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 576 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 576 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 576 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 576 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 576 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 576 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 576 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 576 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 576 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 576 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 576 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 576 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 576 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 576 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 576 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 576 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 576 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 576 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 576 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 576 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 34, + 576 + ] + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 576 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 36, + 576 + ] + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 37, + 576 + ] + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 576 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 576 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 40, + 576 + ] + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 576 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 576 + 
] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 43, + 576 + ] + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 576 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 576 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 576 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 576 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 576 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 576 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 576 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 576 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 576 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 576 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 576 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 576 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 576 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 576 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 576 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 576 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 576 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 576 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 576 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 576 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 576 + ] + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 576 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 576 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "D_SHAPE": [ + 67, + 576 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 576 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 576 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 576 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 576 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 576 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 576 + ] + }, + "74": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 576 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 576 + ] + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 576 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 576 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 576 + ] + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 576 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 576 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 576 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 576 + ] + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 576 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 576 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 576 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 576 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 576 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 576 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 576 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 576 + ] + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 576 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 576 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 576 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 576 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 576 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 576 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 576 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 576 + ] + }, + 
"99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 576 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 576 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 576 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 576 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 576 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 576 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 576 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 576 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 576 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 576 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 576 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 576 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 576 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 576 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 576 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 576 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 576 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 576 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 
4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 576 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 576 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 576 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 576 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 576 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 576 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 576 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 576 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 576 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 576 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 576 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 576 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 
100644 index 0000000000000000000000000000000000000000..01acf72f496a1193478727ac4c35573a77cc318a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,42 @@ +{ + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 576 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 576 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..8d59a6225a9bc7701bfb0cc7d904d5c4f9a0b8c5 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=576,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,40 @@ +{ + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 32, + "num_warps": 4, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 576 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 32, + "num_warps": 4, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 10, + 576 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..9604674adb87127cd608e440805923f85ac45467 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=1536,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2605 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 7168 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 7168 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 7168 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 7168 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 7168 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 7168 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 7168 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, 
+ "D_SHAPE": [ + 17, + 7168 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 18, + 7168 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 19, + 7168 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 21, + 7168 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 22, + 7168 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 23, + 7168 + ], + "D_DTYPE": 16 + }, + 
"24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 24, + 7168 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 25, + 7168 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 26, + 7168 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 27, + 7168 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 7168 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 33, + 7168 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 34, + 7168 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 7168 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + 
"SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 7168 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 7168 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 7168 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 7168 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 7168 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 7168 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 7168 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 51, + 7168 + ] + }, + "52": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 52, + 7168 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 53, + 7168 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, 
+ "num_stages": 1, + "D_SHAPE": [ + 54, + 7168 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 7168 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 7168 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 7168 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 7168 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 7168 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 
7168 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 7168 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 7168 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 7168 + ] + }, + "64": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 64, + 7168 + ] + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 7168 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 66, + 7168 + ] + }, + "67": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 7168 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 7168 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 7168 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 70, + 7168 + ] + }, + "71": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 73, + 7168 + ] + }, + "74": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 74, + 7168 + ] + }, + "75": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 76, + 7168 + ] + }, + "77": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 77, + 7168 + ] + }, + "78": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 7168 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 7168 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 7168 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 7168 + ] + }, + "85": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 7168 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 7168 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 7168 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 7168 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 7168 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + 
"NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 7168 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 7168 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 7168 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 7168 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 7168 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 7168 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 7168 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 7168 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 7168 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 7168 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 7168 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 7168 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 7168 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 7168 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 7168 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 7168 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 7168 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 7168 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 7168 + ], + "D_DTYPE": 
16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 7168 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 7168 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 7168 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 7168 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 7168 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 1, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 7168 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 7168 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 7168 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 7168 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 7168 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 7168 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 7168 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 7168 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 7168 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..b555aca1f8c2630daefc247049baa688bf730e32 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=1536,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2615 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 7168 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 
8, + 7168 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 7168 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 7168 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 7168 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 7168 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 7168 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 7168 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 7168 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 7168 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 7168 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 7168 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 7168 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 7168 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 7168 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 
26, + 7168 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 7168 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 7168 + ], + "D_DTYPE": 16 + 
}, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 7168 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 7168 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 7168 + ], + "D_DTYPE": 16 + }, + 
"39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 7168 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 7168 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 
44, + 7168 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 7168 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 7168 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 50, + 7168 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 7168 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 7168 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 7168 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 7168 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 7168 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 7168 + ], + "D_DTYPE": 16 + }, + "57": { 
+ "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 7168 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 7168 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 7168 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 7168 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 7168 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 7168 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 7168 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 7168 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 7168 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 7168 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 7168 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 7168 + 
] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 7168 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 7168 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 7168 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "D_SHAPE": [ + 74, + 7168 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 7168 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 7168 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 7168 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 7168 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 7168 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 7168 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 7168 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 7168 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 7168 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 7168 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 7168 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 7168 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 7168 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 7168 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 80, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 7168 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 7168 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 7168 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 7168 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 7168 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 880, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 7168 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 880, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 7168 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 7168 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 7168 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 880, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 7168 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 7168 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 7168 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 7168 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 7168 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 7168 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 7168 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 7168 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 7168 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 7168 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 7168 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 7168 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 7168 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 7168 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 7168 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 7168 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 7168 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 7168 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 7168 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 7168 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 127, + 7168 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 7168 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..5e07b1d82cd9581961bd1879bbe52e7e8d512930 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2048,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2640 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 7168 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 7168 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 7168 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 7168 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 7168 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 7168 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 7168 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 17, + 7168 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 18, + 7168 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 19, + 7168 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 20, + 7168 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 21, + 7168 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 2, + 22, + 7168 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 23, + 7168 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 24, + 7168 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 25, + 7168 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 26, + 7168 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 27, + 7168 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 
2, + 28, + 7168 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 29, + 7168 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 30, + 7168 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 31, + 7168 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 32, + 7168 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 33, + 7168 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 34, + 7168 + ] + }, + 
"35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 35, + 7168 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 36, + 7168 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 37, + 7168 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 38, + 7168 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 39, + 7168 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 40, + 7168 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 41, + 7168 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 42, + 7168 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 43, + 7168 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 44, + 7168 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 45, + 7168 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 46, + 7168 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 47, + 7168 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 48, + 7168 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 49, + 7168 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 50, + 7168 + ] + }, + "51": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 51, + 7168 + ] + }, + "52": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 52, + 7168 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, 
+ "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 53, + 7168 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 54, + 7168 + ] + }, + "55": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 55, + 7168 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 56, + 7168 + ] + }, + "57": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 57, + 7168 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 58, + 7168 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 7168 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 60, + 7168 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 7168 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 62, + 7168 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 7168 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 64, + 7168 + ] + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 65, + 7168 + ] + }, + "66": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 66, + 7168 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 7168 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 7168 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 7168 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 7168 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 7168 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 7168 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 7168 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 7168 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 7168 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 7168 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 
83, + 7168 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 7168 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 7168 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 7168 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 7168 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 7168 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 7168 + ] + }, + "90": 
{ + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 7168 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 7168 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 7168 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 7168 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 7168 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + 
"DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 7168 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 98, + 7168 + ] + }, + "99": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 100, + 7168 + ] + }, + "101": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 101, + 7168 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 102, + 7168 + ] + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 7168 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 105, + 7168 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 106, + 7168 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 107, + 7168 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 108, + 7168 + ] + }, + "109": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 109, + 7168 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 110, + 7168 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 111, + 7168 + ] + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + 
"NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 7168 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 7168 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 7168 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 7168 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 7168 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 7168 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 
2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 7168 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 7168 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 7168 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 7168 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 7168 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 7168 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 7168 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 7168 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 7168 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 7168 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..c1f19d5127b028fed328c4ba01ca4e6caf77b834 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2048,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2639 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 7, + 7168 + ], + "D_DTYPE": 16 + }, + "8": { + "USE_REDUCE_KERNEL": 0, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 7168 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 7168 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 7168 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 7168 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 7168 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 7168 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 7168 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 7168 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 7168 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 7168 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 7168 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 7168 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 7168 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 7168 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 7168 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 7168 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 7168 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 26, + 7168 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 7168 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 7168 + ], + "D_DTYPE": 16 + }, + "33": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 7168 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 7168 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 7168 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 7168 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 7168 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 7168 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 7168 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 7168 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 7168 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 51, + 7168 + ] + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 7168 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 7168 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 7168 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 55, + 7168 + ] + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 7168 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 7168 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 7168 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 7168 + ] + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 7168 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 7168 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 7168 + ] + }, + "63": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 7168 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 7168 + ] + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 7168 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 7168 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 7168 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + 
"NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 7168 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 7168 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 7168 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 7168 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 7168 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 7168 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 7168 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 7168 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 7168 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 7168 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 7168 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, 
+ "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 7168 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 7168 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 7168 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 7168 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 7168 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 7168 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 
1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 7168 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 8, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 7168 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 7168 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 7168 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 7168 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 7168 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 7168 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 7168 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 102, + 7168 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 7168 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 7168 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 7168 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 7168 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 7168 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 7168 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 7168 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 7168 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 7168 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 7168 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 7168 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 7168 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 7168 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 7168 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 119, + 7168 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 7168 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 7168 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 7168 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 7168 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 7168 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 7168 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 7168 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 7168 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 7168 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..6a6c220224cf9575dfa78c022d183031d2c56094 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2304,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2637 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 
7, + 7168 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 7168 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 7168 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 7168 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 7168 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 7168 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 7168 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 17, + 7168 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 18, + 7168 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 19, + 7168 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 20, + 7168 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 21, + 7168 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 22, + 7168 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 23, + 7168 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 24, + 7168 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 25, + 7168 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, 
+ "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 26, + 7168 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 27, + 7168 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 28, + 7168 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 29, + 7168 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 30, + 7168 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 31, + 7168 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + 
"NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 32, + 7168 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 33, + 7168 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 34, + 7168 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 35, + 7168 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 36, + 7168 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 37, + 7168 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 
2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 38, + 7168 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 39, + 7168 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 40, + 7168 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 41, + 7168 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 42, + 7168 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 43, + 7168 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 44, + 7168 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 45, + 7168 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 46, + 7168 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 47, + 7168 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 48, + 7168 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 49, + 7168 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 50, + 7168 + ] + }, + "51": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 51, + 7168 + ] + }, + "52": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 52, + 7168 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 53, + 7168 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 54, + 7168 + ] + }, + "55": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 55, + 7168 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 56, + 7168 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 7168 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 58, + 7168 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 7168 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 60, + 7168 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 7168 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": 
[ + 2, + 62, + 7168 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 7168 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 64, + 7168 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 7168 + ] + }, + "66": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 66, + 7168 + ] + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 67, + 7168 + ] + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 68, + 7168 + ] + }, + "69": { 
+ "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 69, + 7168 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 7168 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 7168 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 7168 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 7168 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 7168 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 7168 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 7168 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 7168 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 7168 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 7168 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 7168 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 
2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 7168 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 7168 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 7168 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 7168 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 7168 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 7168 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 
1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 7168 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 7168 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 7168 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 7168 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 7168 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 7168 + ] + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 102, + 7168 + ] + }, + "103": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 104, + 7168 + ] + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 2, + 105, + 7168 + ] + }, + "106": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 106, + 7168 + ] + }, + "107": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 107, + 7168 + ] + }, + "108": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 108, + 7168 + ] + }, + "109": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 109, + 7168 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 110, + 7168 + ] + }, + "111": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 111, + 7168 + ] + 
}, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 7168 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 7168 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 7168 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 7168 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + 
"DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 7168 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 7168 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 7168 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 7168 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 7168 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 7168 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 7168 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 7168 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 7168 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 7168 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 7168 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 7168 + ] + } +} \ 
No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..a32f167bc7591cf6845dfb4a59c757f0840a0238 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=2304,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2604 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 7168 + ] + }, + "8": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 7168 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 7168 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 7168 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 7168 + ] + }, + "15": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 7168 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 7168 + ] + }, + "17": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 7168 + ], + "D_DTYPE": 16 
+ }, + "18": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 7168 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 7168 + ] + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 7168 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 7168 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 7168 + ] + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 7168 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 7168 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 7168 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 7168 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 7168 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 7168 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 7168 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, 
+ "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 7168 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 7168 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 41, + 7168 + ] + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 42, + 7168 + ] + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 44, + 7168 + ] + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 45, + 7168 + ] + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 7168 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 47, + 7168 + ] + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 48, + 7168 + ] + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 7168 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 7168 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 7168 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 7168 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 7168 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 7168 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 7168 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 7168 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 7168 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 7168 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 
1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 7168 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 7168 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 7168 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 7168 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 7168 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 7168 + ] + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 7168 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 67, + 7168 + ] + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 7168 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 7168 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 7168 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 7168 + 
] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 7168 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 7168 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 7168 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + 
"DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 7168 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 7168 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 7168 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 7168 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 84, + 7168 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 7168 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 7168 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 7168 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 7168 + ] + }, + "89": { + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 7168 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 7168 + ] + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 7168 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 7168 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 7168 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 
94, + 7168 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 7168 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 79, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 7168 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 
100, + 7168 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 7168 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 7168 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 7168 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 7168 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 7168 + ] + }, + "107": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 7168 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 7168 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 7168 + ] + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 7168 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 7168 + ] + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 7168 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 7168 + ] + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 7168 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 7168 + ] + }, + "116": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 7168 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 7168 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 7168 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 7168 + ] + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 7168 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 7168 + ] + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 7168 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 7168 + ] + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 7168 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 7168 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 7168 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 7168 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=256,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=256,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..e452d6db02e0f23f45d86a8a742bcb9c814226c3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=256,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2569 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 7168 + ], + "D_DTYPE": 16 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 7168 + ], + "D_DTYPE": 16 + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 7168 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 7168 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 7168 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 7168 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 7168 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 7168 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 7168 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 7168 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 7168 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 7168 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 7168 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 7168 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 7168 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 7168 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 7168 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 7168 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 7168 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 
0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 7168 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 7168 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 7168 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 7168 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 7168 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 
1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 7168 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 7168 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 7168 + ], + 
"D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 7168 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 7168 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 7168 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 7168 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 7168 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 7168 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + 
"SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 7168 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 7168 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 7168 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 7168 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 7168 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 7168 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 7168 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 7168 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 7168 + ] + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": 
false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 7168 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 7168 + ] + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 7168 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 60, + 7168 + ] + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 7168 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 7168 + ] + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, 
+ "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 7168 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 7168 + ] + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 7168 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 7168 + ], + "D_DTYPE": 16 + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 7168 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 7168 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, 
+ "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 7168 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 70, + 7168 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 7168 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 7168 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 7168 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 7168 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 7168 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 7168 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 7168 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 7168 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 7168 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 7168 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 7168 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 7168 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 7168 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 7168 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 7168 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 7168 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 7168 + ], + 
"D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 7168 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 7168 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 7168 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 7168 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 7168 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 7168 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 7168 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 7168 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 7168 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 7168 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 7168 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 7168 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 7168 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 7168 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 7168 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 7168 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 7168 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 7168 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 7168 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 7168 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 7168 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 7168 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 7168 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 7168 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 7168 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 7168 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 7168 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 7168 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 7168 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, 
+ "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 7168 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 7168 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 7168 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 7168 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 7168 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 7168 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 
7168 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 7168 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 7168 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 7168 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 7168 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..457de8673d77396c4ee0b8f92960e4a68f8bcdcc --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=256,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2614 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 
80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 7168 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 7168 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 7168 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + 
"D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 7168 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 7168 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 7168 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 7168 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 7168 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 7168 + ] + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 7168 + ] + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 7168 + ] + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 7168 + ] + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 22, + 7168 + ] + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 7168 + ] + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 7168 + ] + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 25, + 7168 + ] + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 7168 + ] + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 7168 + ] + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 7168 + ] + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 29, + 7168 + ] + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 7168 + ] + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 7168 + ] + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 7168 + ] + }, + "33": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 7168 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + 
"DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 35, + 7168 + ] + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 7168 + ] + }, + "39": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 39, + 7168 + ] + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 7168 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 7168 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 7168 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 46, + 7168 + ] + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 49, + 7168 + ] + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 50, + 7168 + ] + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 7168 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 7168 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 7168 + ], + "D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 7168 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 7168 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 7168 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 7168 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 7168 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 7168 + ], + 
"D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 7168 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 7168 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 7168 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 7168 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 7168 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 7168 + ] + }, + "66": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 7168 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 7168 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 68, + 7168 + ] + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 69, + 7168 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 7168 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 
32, + "D_SHAPE": [ + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 7168 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 7168 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 7168 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 7168 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 7168 + ], + "D_DTYPE": 
16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 7168 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 7168 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 7168 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 7168 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 7168 + ], + "D_DTYPE": 16 
+ }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 7168 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 7168 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 7168 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 7168 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 7168 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 7168 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 7168 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 7168 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 7168 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 7168 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 7168 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 7168 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 7168 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 7168 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 7168 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 880, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 7168 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 7168 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 7168 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 7168 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 7168 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 7168 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 7168 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 7168 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 7168 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 7168 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 7168 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 7168 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 7168 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, 
+ "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 7168 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 7168 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 7168 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 7168 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 7168 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 7168 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 7168 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 7168 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 7168 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 7168 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 160, + "DANGLING_TILES": 64, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 7168 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 7168 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 880, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 7168 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 7168 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=512,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=512,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..defd9d2476a34e2d97a67fe8edda6ee84627541c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=512,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2573 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 1, + 7168 + ], + "D_DTYPE": 16 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 2, + 7168 + ], + "D_DTYPE": 16 + 
}, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 3, + 7168 + ], + "D_DTYPE": 16 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 4, + 7168 + ], + "D_DTYPE": 16 + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 5, + 7168 + ], + "D_DTYPE": 16 + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 6, + 7168 + ], + "D_DTYPE": 16 + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 7, + 7168 + ], + "D_DTYPE": 16 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 8, + 7168 + ], + "D_DTYPE": 16 + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 9, + 7168 + ], + "D_DTYPE": 16 + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 10, + 7168 + ], + "D_DTYPE": 16 + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 11, + 7168 + ], + "D_DTYPE": 16 + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 12, + 7168 + ], + "D_DTYPE": 16 + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 13, + 7168 + ], + "D_DTYPE": 16 + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 14, + 7168 + ], + "D_DTYPE": 16 + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 15, + 7168 + ], + "D_DTYPE": 16 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 16, + 7168 + ], + "D_DTYPE": 16 + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 17, + 7168 + ], + "D_DTYPE": 16 + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 18, + 7168 + ], + "D_DTYPE": 16 + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 19, + 7168 + ], + "D_DTYPE": 16 + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 
1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 21, + 7168 + ], + "D_DTYPE": 16 + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 22, + 7168 + ], + "D_DTYPE": 16 + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 23, + 7168 + ], + "D_DTYPE": 16 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 24, + 7168 + ], + "D_DTYPE": 16 + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 25, + 7168 + ], + "D_DTYPE": 16 + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 26, + 7168 + ], + "D_DTYPE": 16 + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 27, + 7168 + ], + "D_DTYPE": 16 + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 32, + 7168 + ], + "D_DTYPE": 16 + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 33, + 
7168 + ], + "D_DTYPE": 16 + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 35, + 7168 + ], + "D_DTYPE": 16 + }, + "36": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 38, + 7168 + ], + "D_DTYPE": 16 + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 39, + 7168 + ], + "D_DTYPE": 16 + }, + "40": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 41, + 7168 + ], + "D_DTYPE": 16 + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 44, + 7168 + ], + "D_DTYPE": 16 + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 45, + 7168 + ], + "D_DTYPE": 16 + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 46, + 7168 + ], + "D_DTYPE": 16 + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 50, + 7168 + ], + "D_DTYPE": 16 + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 51, + 7168 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 7168 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 7168 + ], + "D_DTYPE": 16 + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 54, + 7168 + ], + "D_DTYPE": 16 + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 55, + 7168 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 7168 + ], + "D_DTYPE": 16 + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 57, + 7168 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 7168 + ], + "D_DTYPE": 16 + }, + "59": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 59, + 7168 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 7168 + ], + "D_DTYPE": 16 + }, + "61": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 61, + 7168 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 7168 + ], + "D_DTYPE": 16 + }, + "63": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 63, + 7168 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 7168 + ], + "D_DTYPE": 16 + }, + "65": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 65, + 7168 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 7168 + ] + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 67, + 7168 + ], + "D_DTYPE": 16 + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 68, + 7168 + ], + "D_DTYPE": 16 + }, + "69": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_SHAPE": [ + 69, + 7168 + ], + "D_DTYPE": 16 + }, + "70": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 1, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + 
"D_SHAPE": [ + 70, + 7168 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 7168 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 7168 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 7168 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 7168 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 320, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 7168 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 7168 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 7168 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 7168 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 7168 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 7168 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 7168 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 7168 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 7168 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 7168 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 7168 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 7168 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 7168 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 94, + 7168 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 7168 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 7168 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 7168 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 7168 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 7168 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 7168 + ], + "D_DTYPE": 16 + }, + "101": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 7168 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 7168 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 7168 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 7168 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 7168 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 7168 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 7168 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 7168 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 7168 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 7168 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 7168 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 7168 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 7168 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 7168 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 7168 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 7168 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 7168 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 7168 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 7168 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 1, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 7168 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 7168 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 7168 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 7168 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 7168 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 7168 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 7168 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 7168 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..b7440ab16c4f144fe2bf39a69fe2cdd09fd6e408 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=512,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2596 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, 
+ "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 7, + 7168 + ], + "D_DTYPE": 16 + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 8, + 7168 + ], + "D_DTYPE": 16 + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 9, + 7168 + ], + "D_DTYPE": 16 + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 10, + 7168 + ], + "D_DTYPE": 16 + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 11, + 7168 + ], + "D_DTYPE": 16 + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 12, + 7168 + ], + "D_DTYPE": 16 + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 13, + 7168 + ], + "D_DTYPE": 16 + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 14, + 7168 + ], + "D_DTYPE": 16 + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 15, + 7168 + ], + "D_DTYPE": 16 + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 16, + 7168 + ], + "D_DTYPE": 16 + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 17, + 7168 + ], + "D_DTYPE": 16 + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 18, + 7168 + ], + "D_DTYPE": 16 + }, + "19": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 7168 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 7168 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 7168 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 7168 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 7168 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 7168 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 
7168 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 7168 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + "29": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 7168 + ], + "D_DTYPE": 16 + }, + "33": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 33, + 7168 + ] + }, + "34": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 7168 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 400, + "DANGLING_TILES": 48, + "NUM_CUS_STREAMK": 77, + "D_DTYPE": 32, + "D_SHAPE": [ + 38, + 7168 + ] + }, + "39": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 7168 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 7168 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 7168 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 7168 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 7168 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 7168 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 7168 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 52, + 7168 + ] + }, + "53": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 53, + 7168 + ] + }, + "54": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 54, + 7168 + ] + }, + "55": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 7168 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 56, + 7168 + ] + }, + "57": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 57, + 7168 + ] + }, + "58": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 58, + 7168 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 7168 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 7168 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 7168 + ] + }, + "62": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 62, + 7168 + ] + }, + "63": 
{ + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 7168 + ] + }, + "64": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 64, + 7168 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 7168 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 640, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 66, + 7168 + ] + }, + "67": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 7168 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 7168 + ], + 
"D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 7168 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 7168 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 7168 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 7168 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 7168 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 7168 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 7168 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 7168 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 7168 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 78, + 7168 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 7168 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 7168 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 7168 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 7168 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 7168 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 7168 + ], + "D_DTYPE": 16 + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 7168 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 7168 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + 
"num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 7168 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 7168 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 7168 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 7168 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 7168 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 7168 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 7168 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 7168 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 7168 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 880, + "DANGLING_TILES": 16, + "NUM_CUS_STREAMK": 64, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 7168 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + 
"num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 7168 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 7168 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 7168 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 7168 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 
2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 7168 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 7168 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 7168 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 7168 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 109, + 7168 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 7168 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 7168 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 7168 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 7168 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 7168 + ], + "D_DTYPE": 16 + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 7168 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 7168 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 7168 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 7168 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 7168 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 7168 + ] + }, + "123": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 7168 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 7168 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 7168 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 7168 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 7168 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 1, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 
80, + "DANGLING_TILES": 32, + "NUM_CUS_STREAMK": 74, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 7168 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..01a78cfecc655809c511cff65f7422123252bc5f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=7168,device_name=BW200,dtype=w4a16,group_size=64.json @@ -0,0 +1,2597 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, 
+ "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 7168 + ] + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 7168 + ] + }, + "9": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 7168 + ] + }, + "11": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + 
"NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 7168 + ] + }, + "14": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 7168 + ] + }, + "15": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 7168 + ] + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 7168 + ] + }, + "17": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 7168 + ] + }, + "18": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 7168 + ] + }, + "19": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 19, + 7168 + ] + }, + "20": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 20, + 7168 + ] + }, + "21": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 21, + 7168 + ] + }, + "22": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 22, + 7168 + ] + }, + "23": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 23, + 7168 + ] + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 24, + 7168 + ] + }, + "25": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 25, + 7168 + ] + }, + "26": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 26, + 7168 + ] + }, + "27": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 8, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 27, + 7168 + ] + }, + "28": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 28, + 7168 + ] + }, + "29": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 29, + 
7168 + ] + }, + "30": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 30, + 7168 + ] + }, + "31": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 31, + 7168 + ] + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 32, + 7168 + ] + }, + "33": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 33, + 7168 + ] + }, + "34": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 34, + 7168 + ] + }, + "35": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 35, + 7168 + ] + }, + "36": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 36, + 7168 + ] + }, + "37": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 37, + 7168 + ] + }, + "38": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 38, + 7168 + ] + }, + "39": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 39, + 7168 + ] + }, + "40": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 40, + 7168 + ] + }, + "41": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 41, + 7168 + ] + }, + "42": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 4, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 42, + 7168 + ] + }, + "43": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 43, + 7168 + ] + }, + "44": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 44, + 7168 + ] + }, + "45": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 45, + 7168 + ] + }, + "46": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 46, + 7168 + ] + }, + "47": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 47, + 7168 + ] + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 48, + 7168 + ] + }, + "49": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 49, + 7168 + ] + }, + "50": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 50, + 7168 + ] + }, + "51": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 51, + 7168 + ] + }, + "52": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 52, + 7168 + ] + }, + "53": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 53, + 7168 + ] + }, + "54": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, 
+ "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 54, + 7168 + ] + }, + "55": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 55, + 7168 + ] + }, + "56": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 56, + 7168 + ] + }, + "57": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 57, + 7168 + ] + }, + "58": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 58, + 7168 + ] + }, + "59": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 59, + 7168 + ] + }, + "60": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + 
"NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 60, + 7168 + ] + }, + "61": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 61, + 7168 + ] + }, + "62": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 62, + 7168 + ] + }, + "63": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 63, + 7168 + ] + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 2, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 64, + 7168 + ] + }, + "65": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 65, + 7168 + ] + }, + "66": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 66, + 7168 + ] + }, + "67": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 67, + 7168 + ] + }, + "68": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 68, + 7168 + ] + }, + "69": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "SCHEDULER": 0, + "SPLITK": 2, + "USE_REDUCE_KERNEL": true, + "NUM_CUS": 80, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 69, + 7168 + ] + }, + "70": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 70, + 7168 + ] + }, + "71": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 71, + 7168 + ] + }, + "72": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + 
"NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 72, + 7168 + ] + }, + "73": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 73, + 7168 + ] + }, + "74": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 74, + 7168 + ] + }, + "75": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 75, + 7168 + ] + }, + "76": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 76, + 7168 + ] + }, + "77": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 77, + 7168 + ] + }, + "78": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 78, + 7168 + ] + }, + "79": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 79, + 7168 + ] + }, + "80": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 80, + 7168 + ] + }, + "81": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 81, + 7168 + ] + }, + "82": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 82, + 7168 + ] + }, + "83": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 83, + 7168 + ] + }, + "84": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + 
"D_SHAPE": [ + 84, + 7168 + ] + }, + "85": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 85, + 7168 + ] + }, + "86": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 86, + 7168 + ] + }, + "87": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 87, + 7168 + ] + }, + "88": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 88, + 7168 + ] + }, + "89": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 89, + 7168 + ] + }, + "90": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 90, + 7168 + ] + }, + "91": { + 
"USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 91, + 7168 + ] + }, + "92": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 92, + 7168 + ] + }, + "93": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 93, + 7168 + ] + }, + "94": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 94, + 7168 + ] + }, + "95": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 95, + 7168 + ] + }, + "96": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 96, + 7168 + ] + }, + "97": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 97, + 7168 + ] + }, + "98": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 98, + 7168 + ] + }, + "99": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 99, + 7168 + ] + }, + "100": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 100, + 7168 + ] + }, + "101": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 101, + 7168 + ] + }, + "102": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 102, + 7168 + ] + }, + "103": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 103, + 7168 + ] + }, + "104": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 104, + 7168 + ] + }, + "105": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 105, + 7168 + ] + }, + "106": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 106, + 7168 + ] + }, + "107": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 107, + 7168 + ] + }, + "108": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 108, + 7168 + ] + }, + "109": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 109, + 7168 + ] + }, + "110": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 110, + 7168 + ] + }, + "111": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 111, + 7168 + ] + }, + "112": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 112, + 7168 + ] + }, + "113": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 113, + 7168 + ] + }, + "114": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 114, + 7168 + ] + }, + "115": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + 
"num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 115, + 7168 + ] + }, + "116": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 116, + 7168 + ] + }, + "117": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 117, + 7168 + ] + }, + "118": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 118, + 7168 + ] + }, + "119": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 119, + 7168 + ] + }, + "120": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 120, + 7168 + ] + }, + "121": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 121, + 7168 + ] + }, + "122": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 122, + 7168 + ] + }, + "123": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 123, + 7168 + ] + }, + "124": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 124, + 7168 + ] + }, + "125": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 125, + 7168 + ] + }, + "126": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 126, + 7168 + ] + }, + "127": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 127, + 7168 + ] + }, + "128": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 128, + 7168 + ] + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..c6080e2667bd45f78b1928107fb7d7fa23aa5c88 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/awq_w4a16/awq_gemm_N=7168,K=7168,device_name=K100_AI,dtype=w4a16,group_size=64.json @@ -0,0 +1,2562 @@ +{ + "1": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 1, + 7168 + ] + }, + "2": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 2, + 7168 + ] + }, + "3": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 3, + 7168 + ] + }, + "4": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 4, + 7168 + ] + }, + "5": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 5, + 7168 + ] + }, + "6": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 6, + 7168 + ] + }, + "7": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 7, + 7168 + ] + }, + "8": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 8, + 7168 + ] + }, + "9": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 9, + 7168 + ] + }, + "10": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, 
+ "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 10, + 7168 + ] + }, + "11": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 11, + 7168 + ] + }, + "12": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 12, + 7168 + ] + }, + "13": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 13, + 7168 + ] + }, + "14": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 14, + 7168 + ] + }, + "15": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 15, + 7168 + ] + }, + "16": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 16, + 7168 + ] + }, + "17": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 17, + 7168 + ] + }, + "18": { + "USE_REDUCE_KERNEL": false, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 4, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_DTYPE": 32, + "D_SHAPE": [ + 18, + 7168 + ] + }, + "19": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 19, + 7168 + ], + "D_DTYPE": 16 + }, + "20": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 20, + 7168 + ], + "D_DTYPE": 16 + }, + "21": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 21, + 7168 + ], + "D_DTYPE": 16 + }, + "22": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + 
"DANGLING_TILES": 0, + "D_SHAPE": [ + 22, + 7168 + ], + "D_DTYPE": 16 + }, + "23": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 23, + 7168 + ], + "D_DTYPE": 16 + }, + "24": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 24, + 7168 + ], + "D_DTYPE": 16 + }, + "25": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 25, + 7168 + ], + "D_DTYPE": 16 + }, + "26": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 26, + 7168 + ], + "D_DTYPE": 16 + }, + "27": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 27, + 7168 + ], + "D_DTYPE": 16 + }, + "28": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 28, + 7168 + ], + "D_DTYPE": 16 + }, + 
"29": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 29, + 7168 + ], + "D_DTYPE": 16 + }, + "30": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 30, + 7168 + ], + "D_DTYPE": 16 + }, + "31": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 31, + 7168 + ], + "D_DTYPE": 16 + }, + "32": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 32, + 7168 + ], + "D_DTYPE": 16 + }, + "33": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 33, + 7168 + ], + "D_DTYPE": 16 + }, + "34": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 34, + 7168 + ], + "D_DTYPE": 16 + }, + "35": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 35, + 7168 + ], + "D_DTYPE": 16 + }, + "36": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 36, + 7168 + ], + "D_DTYPE": 16 + }, + "37": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 37, + 7168 + ], + "D_DTYPE": 16 + }, + "38": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 38, + 7168 + ], + "D_DTYPE": 16 + }, + "39": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 39, + 7168 + ], + "D_DTYPE": 16 + }, + "40": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 40, + 7168 + ], + "D_DTYPE": 16 + }, + "41": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, 
+ "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 41, + 7168 + ], + "D_DTYPE": 16 + }, + "42": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 42, + 7168 + ], + "D_DTYPE": 16 + }, + "43": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 43, + 7168 + ], + "D_DTYPE": 16 + }, + "44": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 44, + 7168 + ], + "D_DTYPE": 16 + }, + "45": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 45, + 7168 + ], + "D_DTYPE": 16 + }, + "46": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 46, + 7168 + ], + "D_DTYPE": 16 + }, + "47": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + 
"DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 47, + 7168 + ], + "D_DTYPE": 16 + }, + "48": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 48, + 7168 + ], + "D_DTYPE": 16 + }, + "49": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 49, + 7168 + ], + "D_DTYPE": 16 + }, + "50": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 50, + 7168 + ], + "D_DTYPE": 16 + }, + "51": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 51, + 7168 + ], + "D_DTYPE": 16 + }, + "52": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 52, + 7168 + ], + "D_DTYPE": 16 + }, + "53": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 53, + 7168 + ], + 
"D_DTYPE": 16 + }, + "54": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 54, + 7168 + ], + "D_DTYPE": 16 + }, + "55": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 55, + 7168 + ], + "D_DTYPE": 16 + }, + "56": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 56, + 7168 + ], + "D_DTYPE": 16 + }, + "57": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 57, + 7168 + ], + "D_DTYPE": 16 + }, + "58": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 58, + 7168 + ], + "D_DTYPE": 16 + }, + "59": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 59, + 7168 + ], + "D_DTYPE": 16 + }, + "60": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + 
"SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 60, + 7168 + ], + "D_DTYPE": 16 + }, + "61": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 61, + 7168 + ], + "D_DTYPE": 16 + }, + "62": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 62, + 7168 + ], + "D_DTYPE": 16 + }, + "63": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 63, + 7168 + ], + "D_DTYPE": 16 + }, + "64": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 64, + 7168 + ], + "D_DTYPE": 16 + }, + "65": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 65, + 7168 + ], + "D_DTYPE": 16 + }, + "66": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 66, + 7168 + ], + "D_DTYPE": 16 + }, + "67": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 67, + 7168 + ], + "D_DTYPE": 16 + }, + "68": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 68, + 7168 + ], + "D_DTYPE": 16 + }, + "69": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 69, + 7168 + ], + "D_DTYPE": 16 + }, + "70": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 70, + 7168 + ], + "D_DTYPE": 16 + }, + "71": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 71, + 7168 + ], + "D_DTYPE": 16 + }, + "72": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + 
"num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 72, + 7168 + ], + "D_DTYPE": 16 + }, + "73": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 73, + 7168 + ], + "D_DTYPE": 16 + }, + "74": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 74, + 7168 + ], + "D_DTYPE": 16 + }, + "75": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 75, + 7168 + ], + "D_DTYPE": 16 + }, + "76": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 76, + 7168 + ], + "D_DTYPE": 16 + }, + "77": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 77, + 7168 + ], + "D_DTYPE": 16 + }, + "78": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 
0, + "D_SHAPE": [ + 78, + 7168 + ], + "D_DTYPE": 16 + }, + "79": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 79, + 7168 + ], + "D_DTYPE": 16 + }, + "80": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 80, + 7168 + ], + "D_DTYPE": 16 + }, + "81": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 81, + 7168 + ], + "D_DTYPE": 16 + }, + "82": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 82, + 7168 + ], + "D_DTYPE": 16 + }, + "83": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 83, + 7168 + ], + "D_DTYPE": 16 + }, + "84": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 84, + 7168 + ], + "D_DTYPE": 16 + }, + "85": { + 
"USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 85, + 7168 + ], + "D_DTYPE": 16 + }, + "86": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 86, + 7168 + ], + "D_DTYPE": 16 + }, + "87": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 87, + 7168 + ], + "D_DTYPE": 16 + }, + "88": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 88, + 7168 + ], + "D_DTYPE": 16 + }, + "89": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 89, + 7168 + ], + "D_DTYPE": 16 + }, + "90": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 90, + 7168 + ], + "D_DTYPE": 16 + }, + "91": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 91, + 7168 + ], + "D_DTYPE": 16 + }, + "92": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 92, + 7168 + ], + "D_DTYPE": 16 + }, + "93": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 93, + 7168 + ], + "D_DTYPE": 16 + }, + "94": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 94, + 7168 + ], + "D_DTYPE": 16 + }, + "95": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 95, + 7168 + ], + "D_DTYPE": 16 + }, + "96": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 96, + 7168 + ], + "D_DTYPE": 16 + }, + "97": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 97, + 7168 + ], + "D_DTYPE": 16 + }, + "98": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 98, + 7168 + ], + "D_DTYPE": 16 + }, + "99": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 99, + 7168 + ], + "D_DTYPE": 16 + }, + "100": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 100, + 7168 + ], + "D_DTYPE": 16 + }, + "101": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 101, + 7168 + ], + "D_DTYPE": 16 + }, + "102": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 102, + 7168 + ], + "D_DTYPE": 16 + }, + "103": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, 
+ "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 103, + 7168 + ], + "D_DTYPE": 16 + }, + "104": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 104, + 7168 + ], + "D_DTYPE": 16 + }, + "105": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 105, + 7168 + ], + "D_DTYPE": 16 + }, + "106": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 106, + 7168 + ], + "D_DTYPE": 16 + }, + "107": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 107, + 7168 + ], + "D_DTYPE": 16 + }, + "108": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 108, + 7168 + ], + "D_DTYPE": 16 + }, + "109": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + 
"D_SHAPE": [ + 109, + 7168 + ], + "D_DTYPE": 16 + }, + "110": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 110, + 7168 + ], + "D_DTYPE": 16 + }, + "111": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 111, + 7168 + ], + "D_DTYPE": 16 + }, + "112": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 112, + 7168 + ], + "D_DTYPE": 16 + }, + "113": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 113, + 7168 + ], + "D_DTYPE": 16 + }, + "114": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 114, + 7168 + ], + "D_DTYPE": 16 + }, + "115": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 115, + 7168 + ], + "D_DTYPE": 16 + }, + "116": 
{ + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 116, + 7168 + ], + "D_DTYPE": 16 + }, + "117": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 117, + 7168 + ], + "D_DTYPE": 16 + }, + "118": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 118, + 7168 + ], + "D_DTYPE": 16 + }, + "119": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 119, + 7168 + ], + "D_DTYPE": 16 + }, + "120": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 120, + 7168 + ], + "D_DTYPE": 16 + }, + "121": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 121, + 7168 + ], + "D_DTYPE": 16 + }, + "122": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 
0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 122, + 7168 + ], + "D_DTYPE": 16 + }, + "123": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 123, + 7168 + ], + "D_DTYPE": 16 + }, + "124": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 124, + 7168 + ], + "D_DTYPE": 16 + }, + "125": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 125, + 7168 + ], + "D_DTYPE": 16 + }, + "126": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 126, + 7168 + ], + "D_DTYPE": 16 + }, + "127": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 127, + 7168 + ], + "D_DTYPE": 16 + }, + "128": { + "USE_REDUCE_KERNEL": 0, + "NUM_CUS": 80, + "SCHEDULER": 0, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 32, + "SPLITK": 1, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2, + "NUM_GROUPS": 1, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "D_SHAPE": [ + 128, + 7168 + ], + "D_DTYPE": 16 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=1536,K=1536,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=1536,K=1536,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..3cf681910a516a66da1a154bf431ea557d545928 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=1536,K=1536,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=1536,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=1536,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..b4d8d53bda30c441caa9b291673e3fa274c0cdf2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=1536,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ 
-0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + 
"COMBINE_SCALE_LOAD": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=2048,K=512,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=2048,K=512,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..5807943daa5654dabf2d370472de8527561a43f1 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=2048,K=512,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ 
No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=2304,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=2304,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..c8032b3453dc3460d022a741cbc37b57c848848a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=2304,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=256,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=256,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..ce0a7945516b355e7b299bbc34f9633722f2d0d3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=256,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 1 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + 
}, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=3072,K=1536,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=3072,K=1536,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..badf7c498b822c958e5e9d75cde937c79a4fa257 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=3072,K=1536,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=4096,K=512,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=4096,K=512,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..5c86171fdaefede3d3ac0465bc346496e2e7563d --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=4096,K=512,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=4608,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=4608,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 
100644 index 0000000000000000000000000000000000000000..d761b9598400c3083541905af73b6cf8a8a39a15 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=4608,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=512,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=512,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..e17716c4b9bfa822dc05e4a7633fb1ef3b93db26 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=512,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": true, + "num_warps": 1, + "num_ctas": 1, + 
"num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=576,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=576,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..eba70e46337508ee7ff2b89eae6f359508a8b8d9 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=576,K=7168,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=6144,K=3072,arch=gfx938,cu=64,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=6144,K=3072,arch=gfx938,cu=64,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..a3df999a501394785d58b15b444cd40bea0566b4 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=6144,K=3072,arch=gfx938,cu=64,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,79 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=1024,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=1024,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..d2f1fe99c34a8517adc2bac1610f283dd67222fd --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=1024,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=1152,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=1152,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..6a1a1c5e7cbc4d4caaa50119af6c61718263e5cc --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=1152,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ 
+{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + 
"COMBINE_SCALE_LOAD": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=128,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=128,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..b6630561c8dbb64cf3414ba0bb92d9a8130c7aa8 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=128,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "COMBINE_SCALE_LOAD": 0 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 2, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "num_warps": 4, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 1, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 1, + "COMBINE_SCALE_LOAD": 0 + }, + "32768": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + } +} \ 
No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2048,arch=gfx938,cu=64,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2048,arch=gfx938,cu=64,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..314953d8368701e42de8e0fdfad36da3f7b81c56 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2048,arch=gfx938,cu=64,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,79 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + 
"num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..2951ecd9da243ca943b3d86b6a9caee86f7cb59a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + 
"COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2304,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2304,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 
0000000000000000000000000000000000000000..99337fb39100659118ad72899a62aa072c95e642 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=2304,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 128, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=256,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=256,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..06ea49fdac4f36f84d195e6eda9f38c2947b992c --- /dev/null +++ b/aiter/ops/triton/configs/gemm/block_w8a8/N=7168,K=256,device_name=BW200,dtype=int8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 1, + "num_ctas": 1, + 
"num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "COMBINE_SCALE_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "COMBINE_SCALE_LOAD": true, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW100,group_size=128.json b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW100,group_size=128.json new file mode 100644 index 0000000000000000000000000000000000000000..0f7b6f9d0e1acb7fd360ae14b0369e76de1f123f --- /dev/null +++ b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW100,group_size=128.json @@ -0,0 +1,152 @@ +{ + "128": { + "BLOCK_SIZE_M": 1, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 2, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "1152": { + "BLOCK_SIZE_M": 2, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "7168": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + 
"num_stages": 1 + }, + "8416": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "18432": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "25248": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "33664": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "36864": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "37872": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "49152": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "117824": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "131072": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "147456": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "196608": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "229376": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "262144": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "294912": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "917504": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW200,group_size=128.json b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW200,group_size=128.json new file mode 100644 index 
0000000000000000000000000000000000000000..3b4a3f8d1649e1ca79407868a2f78253e53f1af5 --- /dev/null +++ b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW200,group_size=128.json @@ -0,0 +1,176 @@ +{ + "14336": { + "BLOCK_SIZE": 128, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE": 128, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "28672": { + "BLOCK_SIZE": 128, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "57344": { + "BLOCK_SIZE": 128, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "114688": { + "BLOCK_SIZE": 512, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "131072": { + "BLOCK_SIZE": 512, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "147456": { + "BLOCK_SIZE": 512, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "229376": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "262144": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "458752": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "524288": { + "BLOCK_SIZE": 1024, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "917504": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "1048576": { + "BLOCK_SIZE": 1024, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "1835008": { + "BLOCK_SIZE": 1024, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "2097152": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "2359296": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "3670016": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "4194304": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "4718592": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + 
"num_stages": 2 + }, + "6291456": { + "BLOCK_SIZE": 8192, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "7340032": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "8388608": { + "BLOCK_SIZE": 8192, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "14680064": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16777216": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "18874368": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "25165824": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "29360128": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "58720256": { + "BLOCK_SIZE": 4096, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "117440512": { + "BLOCK_SIZE": 4096, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW200,group_size=64.json b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW200,group_size=64.json new file mode 100644 index 0000000000000000000000000000000000000000..b4b33acfb4d3713dd365a2ba22a22bf4505aa098 --- /dev/null +++ b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=BW200,group_size=64.json @@ -0,0 +1,86 @@ +{ + "14336": { + "BLOCK_SIZE": 256, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "28672": { + "BLOCK_SIZE": 128, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "57344": { + "BLOCK_SIZE": 512, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "114688": { + "BLOCK_SIZE": 1024, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "229376": { + "BLOCK_SIZE": 1024, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "458752": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + 
"num_ctas": 1, + "num_stages": 1 + }, + "917504": { + "BLOCK_SIZE": 2048, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "1835008": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "3670016": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "7340032": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "14680064": { + "BLOCK_SIZE": 4096, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "29360128": { + "BLOCK_SIZE": 4096, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + }, + "58720256": { + "BLOCK_SIZE": 4096, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 2 + }, + "117440512": { + "BLOCK_SIZE": 4096, + "num_warps": 1, + "num_ctas": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=K100_AI,group_size=128.json b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=K100_AI,group_size=128.json new file mode 100644 index 0000000000000000000000000000000000000000..5791b7e947e361baf139a772c51a63167cbb0b76 --- /dev/null +++ b/aiter/ops/triton/configs/group_quant/w8a8_per_token_group_quant_device_name=K100_AI,group_size=128.json @@ -0,0 +1,92 @@ +{ + "2": { + "BLOCK_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 1, + "num_warps": 1, + "num_stages": 1 + }, + "18": { + "BLOCK_SIZE_M": 1, + "num_warps": 1, + "num_stages": 1 + }, + "56": { + "BLOCK_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 3, + "num_warps": 1, + "num_stages": 2 + }, + "4208": { + "BLOCK_SIZE_M": 3, + "num_warps": 1, + "num_stages": 1 + }, + "8416": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2 + }, + "18432": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + }, + "25248": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 
4, + "num_warps": 1, + "num_stages": 2 + }, + "33664": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + }, + "37872": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + }, + "117824": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + }, + "196608": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + }, + "262144": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + }, + "294912": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + }, + "917504": { + "BLOCK_SIZE_M": 4, + "num_warps": 1, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/hstu_attn/MI300X-HSTU_ATTN_BWD.json b/aiter/ops/triton/configs/hstu_attn/MI300X-HSTU_ATTN_BWD.json new file mode 100644 index 0000000000000000000000000000000000000000..0e03d3c0d34722f816f703c05e41464fb751b770 --- /dev/null +++ b/aiter/ops/triton/configs/hstu_attn/MI300X-HSTU_ATTN_BWD.json @@ -0,0 +1,25 @@ +{ + "small_batch": { + "BLOCK_M": 32, + "BLOCK_N": 64, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2, + "SEQUENCE_PARALLEL": true, + "UNROLL": 1 + }, + + "large_batch": { + "BLOCK_M": 32, + "BLOCK_N": 64, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2, + "SEQUENCE_PARALLEL": false, + "UNROLL": 1 + } +} diff --git a/aiter/ops/triton/configs/hstu_attn/MI300X-HSTU_ATTN_FWD.json b/aiter/ops/triton/configs/hstu_attn/MI300X-HSTU_ATTN_FWD.json new file mode 100644 index 0000000000000000000000000000000000000000..8719ea545b82f82f1a8558dc9e5fca221b766bff --- /dev/null +++ b/aiter/ops/triton/configs/hstu_attn/MI300X-HSTU_ATTN_FWD.json @@ -0,0 +1,31 @@ +{ + "small_batch": { + "BLOCK_M": 64, + "BLOCK_N": 32, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + + "batch_512": { + "BLOCK_M": 128, + "BLOCK_N": 32, 
+ "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + + "large_batch": { + "BLOCK_M": 64, + "BLOCK_N": 32, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/hstu_attn/MI350X-HSTU_ATTN_BWD.json b/aiter/ops/triton/configs/hstu_attn/MI350X-HSTU_ATTN_BWD.json new file mode 100644 index 0000000000000000000000000000000000000000..9302e84285d6e990056aee80679e7cfb0c37c672 --- /dev/null +++ b/aiter/ops/triton/configs/hstu_attn/MI350X-HSTU_ATTN_BWD.json @@ -0,0 +1,25 @@ +{ + "small_batch": { + "BLOCK_M": 32, + "BLOCK_N": 64, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 1, + "SEQUENCE_PARALLEL": true, + "UNROLL": 1 + }, + + "large_batch": { + "BLOCK_M": 32, + "BLOCK_N": 64, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1, + "SEQUENCE_PARALLEL": false, + "UNROLL": 1 + } +} diff --git a/aiter/ops/triton/configs/hstu_attn/MI350X-HSTU_ATTN_FWD.json b/aiter/ops/triton/configs/hstu_attn/MI350X-HSTU_ATTN_FWD.json new file mode 100644 index 0000000000000000000000000000000000000000..4864153937af59f9665b42d7eca1ab60bb0906d0 --- /dev/null +++ b/aiter/ops/triton/configs/hstu_attn/MI350X-HSTU_ATTN_FWD.json @@ -0,0 +1,31 @@ +{ + "small_batch": { + "BLOCK_M": 64, + "BLOCK_N": 32, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + + "batch_512": { + "BLOCK_M": 128, + "BLOCK_N": 32, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + + "large_batch": { + "BLOCK_M": 128, + "BLOCK_N": 32, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/moe/E=128,N=176,device_name=BW200B,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=128,N=176,device_name=BW200B,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..a902684d7450bd1481f545f7c3b859e2dd744023 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=176,device_name=BW200B,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": 
"local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=176,device_name=BW200B.json b/aiter/ops/triton/configs/moe/E=128,N=176,device_name=BW200B.json new file mode 100644 index 0000000000000000000000000000000000000000..32804de2bf0437cfca83dd6b79dfcf07b989f336 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=176,device_name=BW200B.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=192,device_name=BW200,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=128,N=192,device_name=BW200,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..583e22c726473a4d88bb4730084338ebffb429e7 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=192,device_name=BW200,is_bottom=True.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + 
"kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=192,device_name=BW200.json b/aiter/ops/triton/configs/moe/E=128,N=192,device_name=BW200.json new file mode 100644 index 0000000000000000000000000000000000000000..568765c2afb255b91e3438a401d07b55f05bc17d --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=192,device_name=BW200.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + 
"sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/E=128,N=352,device_name=BW200B,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=128,N=352,device_name=BW200B,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..c001f267e742bf5c5b17feffebc5a11c02ab9dfe --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=352,device_name=BW200B,is_bottom=True.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=352,device_name=BW200B.json b/aiter/ops/triton/configs/moe/E=128,N=352,device_name=BW200B.json new file mode 100644 index 0000000000000000000000000000000000000000..df061678e3bcc11fc4b720e5c83293b307d46afd --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=352,device_name=BW200B.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW100.json 
b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW100.json new file mode 100644 index 0000000000000000000000000000000000000000..03f77aa4f5bd67f847ad1db493388d52fd10662e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW100.json @@ -0,0 +1,35 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..25e2eefaa347436506b40b2c0470f293584a18a4 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": 
false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200.json b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200.json new file mode 100644 index 0000000000000000000000000000000000000000..3c99cfb90f691a22a08b9e0d1155538c4aab13ad --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200B,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200B,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..b0686df49e5c683037eb3f8083cd2987c1b407de --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200B,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + 
"num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200B.json b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200B.json new file mode 100644 index 0000000000000000000000000000000000000000..336b2f2f08af03258ed51a15477697bdba4fcc20 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=384,device_name=BW200B.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + 
"num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + 
"1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200,is_bottom=True.json new file mode 100644 index 
0000000000000000000000000000000000000000..89f8cccc4a7a0ccd15842c815d0f2a5bc853e4d7 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 8, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": 
false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200.json b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200.json new file mode 100644 index 0000000000000000000000000000000000000000..d80a1d256103cab37b89269597a915fbd73ebc01 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "4": 
{ + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32768": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200B,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200B,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..e6bfd099f237f9a9e982bf0c0584f52b0f3d193a --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200B,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": 
"none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + 
"kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200B.json b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200B.json new file mode 100644 index 0000000000000000000000000000000000000000..c4540bf8b2951483c1403293fa6ec181a19f896c --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=128,N=768,device_name=BW200B.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": 
false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=129,N=352,device_name=BW200B,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=129,N=352,device_name=BW200B,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..32b174d68565f5c188174885413398853f81ee62 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=129,N=352,device_name=BW200B,is_bottom=True.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 
1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=129,N=352,device_name=BW200B.json b/aiter/ops/triton/configs/moe/E=129,N=352,device_name=BW200B.json new file mode 100644 index 0000000000000000000000000000000000000000..7e7d674fc70f66c4f4bb3d3a1a8d70dc2b2596cc --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=129,N=352,device_name=BW200B.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..17484a7d342b176fb69d769ae303def3e104b276 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 
+ }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..c6aa62813c4c261cca2c990f4c07a6eeaa35f21b --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int4_w4a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, 
+ "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + 
"num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..e28c55b3e8ddb0e59369b4c11d18e86cf9a76428 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + 
"kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + 
"sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 
0000000000000000000000000000000000000000..a0eec6dd1a8a96d47d48fb40acc37c57b74570e0 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=16,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + 
"num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + 
"sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=160,N=640,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=160,N=640,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..9f13cb936d5f6f03d14380053fc924702c08f628 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=160,N=640,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + 
"USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=160,N=640,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=160,N=640,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..8757e36396c9b48c18a10fd39f7364d35d5be956 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=160,N=640,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": 
"none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": 
"mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..7c06702a44378ce132b1b441f0df02704581e816 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 
8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..b7992efe5c1303d28a6d697125f44ea9d4ef1431 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int4_w4a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + 
"num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..949cf3d522ba5a40d68e0ac077e76bc3b2367d50 --- /dev/null +++ 
b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..ccbfe8a09282336c94e30da4e323d43a020011c0 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=2,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + 
"num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", 
+ "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=1024,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json 
b/aiter/ops/triton/configs/moe/E=256,N=1024,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..66f6cffb558a0f5a152a9be35c86bc35ecd1ba76 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=1024,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + 
"num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, 
+ "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=1024,device_name=BW200,dtype=int4_w4a8.json b/aiter/ops/triton/configs/moe/E=256,N=1024,device_name=BW200,dtype=int4_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..4bf6d23e5c6354331860b2b950a798a8c32581a1 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=1024,device_name=BW200,dtype=int4_w4a8.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", 
+ "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json 
b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..d4d02f6e6ecb59a44532b93c9740044e74b6b430 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + 
"num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { 
+ "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int4_w4a8.json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int4_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..4098025ece00f94fa9f0be04bc7e7c3f9848b1af --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int4_w4a8.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + 
"sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + 
"kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..c82dbb8dcf3d5976d37bb86046bcf5e13b649523 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": 
true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..2ab2cb5ca65444388c9660973edc73ce9858a6a4 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + 
"num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..714a7b819fb4f487822ac7bdfd5a3a140f8c2f05 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8.json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..152525495b85ba121c94a195ceedc4a8174a760f --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200,dtype=int8_w8a8.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + 
"512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json 
b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..8bdefa8dc60b37dd260723d44a857e3d6bcff379 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,178 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "64": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..4660276a548679e14769301719faa9e1efd960bf --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,178 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + 
"USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 
2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..8bdefa8dc60b37dd260723d44a857e3d6bcff379 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,178 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..4660276a548679e14769301719faa9e1efd960bf --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,178 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + 
"1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..f7b87eb57128cd52ed1f0227d74b7d7ec72f0e5e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": 
false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8.json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..0c581cfc78c63a26d554f2a926178ae7eb6ca08a --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=BW200B,dtype=int8_w8a8.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + 
"16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 
0000000000000000000000000000000000000000..f6e29e2c28d6bbf938a6b4c9681a471a8e503c01 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..0e58648ca1bd6b9f083f952dba50fe614596de28 --- /dev/null +++ 
b/aiter/ops/triton/configs/moe/E=256,N=128,device_name=K100_AI,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + 
"instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=192,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=192,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/aiter/ops/triton/configs/moe/E=256,N=192,device_name=BW200B,dtype=int8_w8a8.json b/aiter/ops/triton/configs/moe/E=256,N=192,device_name=BW200B,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/aiter/ops/triton/configs/moe/E=256,N=2048,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=2048,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..514d9558d313f18b4901b8956c49ba50b39b5724 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=2048,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": 
false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": 
"none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=2048,device_name=BW200,dtype=int4_w4a8.json b/aiter/ops/triton/configs/moe/E=256,N=2048,device_name=BW200,dtype=int4_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..5f5850cb4ee48d835e8186ef7ca795e3367f0007 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=2048,device_name=BW200,dtype=int4_w4a8.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + 
"USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..59f80a967d13fe3c3a9dab726c985adc76c01fab --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,172 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + 
"num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..52fb34e638cfdac8de39610d4a1e4b189b64e49e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,172 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + 
"num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..002b9bc8e8b52cc0adcdec08fb889e1d43a8189e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, 
+ "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..287fdeacb5b3b41526aff725a0967d4221e23c7f --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a16.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + } +} \ 
No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..eeef6dec3e934d37f3126a01dfdda7f2b7324f5b --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": 
false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": 
false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a8.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..8e57c261154291141acc6c32049df13806603599 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int4_w4a8.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..59f80a967d13fe3c3a9dab726c985adc76c01fab --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,172 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + 
"num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..52fb34e638cfdac8de39610d4a1e4b189b64e49e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,172 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": 
"none", + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..714a7b819fb4f487822ac7bdfd5a3a140f8c2f05 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + 
"4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..7fee24b6123614bdb001525d640d241caa786aed --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,dtype=int8_w8a8.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + 
"512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,is_bottom=True.json new file mode 100644 index 
0000000000000000000000000000000000000000..d4eab97cdbcc0351a708c4b9971991bcba5b35f7 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200,is_bottom=True.json @@ -0,0 +1,172 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200.json 
b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200.json new file mode 100644 index 0000000000000000000000000000000000000000..ae1265a90447ea0f09519dadae79adf8c4082763 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200.json @@ -0,0 +1,172 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "128": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + } +} diff --git 
a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..5e56f50812fa027c5493fc733244ff7990eb3594 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,236 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 
1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..91e0449f19d6f04c91ac5fb9c87aa15151646ea8 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,236 @@ +{ + "1": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + 
"instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..8673cee30add80bd47b4f0986dc323c851aac493 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,152 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, 
+ "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + 
"num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..d469771170dc53cc9cf67f83eababc121e710d3e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int4_w4a16.json @@ -0,0 +1,152 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "16": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..4b11a66a4e6b3d1c3a655f636c2b289572d5b60c --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": 
true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "16384": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..f5feed3e5e282a1b28b17a6289858696a5c9d7cc --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + 
"num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..a516e2b4baaae03041772c92466d13c6e26d5bbd --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI,is_bottom=True.json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + 
"COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI.json b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI.json new file mode 100644 index 0000000000000000000000000000000000000000..a5aa82da4a64bac5d8304da6a8d79186108c7227 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=256,device_name=K100_AI.json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "64": 
{ + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/moe/E=256,N=320,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=320,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..8c684a26facc921cdb254a8e25c680a8d15dee88 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=320,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": 
"none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + 
"num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=320,device_name=BW200B,dtype=int8_w8a8.json b/aiter/ops/triton/configs/moe/E=256,N=320,device_name=BW200B,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..2faab3d4b5177654ebc76eb08cf2d38230711da2 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=320,device_name=BW200B,dtype=int8_w8a8.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + 
"kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=384,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=384,device_name=BW200B,dtype=int8_w8a8,is_bottom=True.json new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/aiter/ops/triton/configs/moe/E=256,N=384,device_name=BW200B,dtype=int8_w8a8.json b/aiter/ops/triton/configs/moe/E=256,N=384,device_name=BW200B,dtype=int8_w8a8.json new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/aiter/ops/triton/configs/moe/E=256,N=512,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=512,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..950100c1e04c125d36855fee975472099447fe6e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=512,device_name=BW200,dtype=int4_w4a8,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=512,device_name=BW200,dtype=int4_w4a8.json b/aiter/ops/triton/configs/moe/E=256,N=512,device_name=BW200,dtype=int4_w4a8.json new file mode 100644 index 0000000000000000000000000000000000000000..7a979d160e38b701288ffce82a1de8e7a10db81b --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=512,device_name=BW200,dtype=int4_w4a8.json @@ -0,0 
+1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=64,device_name=K100_AI,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=256,N=64,device_name=K100_AI,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..cd4c46475138e9c9dbe42758208ef3dcdb2f6f54 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=64,device_name=K100_AI,is_bottom=True.json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 16, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + 
"instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=256,N=64,device_name=K100_AI.json b/aiter/ops/triton/configs/moe/E=256,N=64,device_name=K100_AI.json new file mode 100644 index 0000000000000000000000000000000000000000..5d6344f7365bb8f0476f0876f03ee63ed42b0a79 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=256,N=64,device_name=K100_AI.json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + 
"instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=257,N=256,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=257,N=256,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..fd6902b7ddf713dd816eb9afbd112920264af110 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=257,N=256,device_name=BW200B,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,236 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "16": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/moe/E=257,N=256,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=257,N=256,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..e0b22f3885397b1a0de713db354787194954b934 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=257,N=256,device_name=BW200B,dtype=fp8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,236 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": 
false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", 
+ "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..a20d82001c7eb384e4b30c9e2579ab99822ab6d6 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "128": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 
2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..1e778a7a4b597765e518608c24171e9ce8a62bff --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,dtype=int4_w4a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..51a7c1e85a4e1a04a9a48511898c240619701b96 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200,is_bottom=True.json @@ -0,0 +1,152 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200.json b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200.json new file mode 100644 index 0000000000000000000000000000000000000000..4f94eb173f8345c3b910970aeb49fc0329e1a766 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=BW200.json @@ -0,0 +1,152 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, 
+ "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + 
"instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=K100_AI,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=K100_AI,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..1d8ec404cd1ee854a23308ee9f1a00d0ebdc0dea --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=K100_AI,is_bottom=True.json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + 
"16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 2, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=K100_AI.json b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=K100_AI.json new file mode 100644 index 0000000000000000000000000000000000000000..4981511f1538f2bdca3c144dcc5718c1dea8ca77 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=32,N=2048,device_name=K100_AI.json @@ -0,0 +1,162 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, 
+ "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "num_warps": 16, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 8, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "num_warps": 16, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=384,N=256,device_name=BW200B,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=384,N=256,device_name=BW200B,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..db10d87474f6659a2e764abbca1bcab7995b04b2 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=384,N=256,device_name=BW200B,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=384,N=256,device_name=BW200B,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=384,N=256,device_name=BW200B,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..7972630c057793d3736b1d2939e0a15c1c90b087 --- /dev/null +++ 
b/aiter/ops/triton/configs/moe/E=384,N=256,device_name=BW200B,dtype=int4_w4a16.json @@ -0,0 +1,223 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 
8, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=384,N=512,device_name=BW200B,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=384,N=512,device_name=BW200B,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..df19445b31ec3cf455e580bf816e8fc093542f14 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=384,N=512,device_name=BW200B,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + 
"instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, 
+ "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=384,N=512,device_name=BW200B,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=384,N=512,device_name=BW200B,dtype=int4_w4a16.json new file mode 100644 index 
0000000000000000000000000000000000000000..ce768ddb076eb5c5411006d112eebc09329ae5f1 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=384,N=512,device_name=BW200B,dtype=int4_w4a16.json @@ -0,0 +1,210 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": 
false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": 
"none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "USE_MLS_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..3162ba83e54a804673b5e1685635e70eb74569b1 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, 
+ "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + 
"COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int4_w4a16.json new file mode 100644 index 
0000000000000000000000000000000000000000..3c7e9c8f7b4a03f6255550c0bcc0afc70f4f62ce --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int4_w4a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, 
+ "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..95e6c01e8c55e877ec86585b4b2c10992e5c6c6d --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + 
"sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..2a74f9e9d695d3753c373b3cb9ffa219ea7facdc --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=4,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, 
+ "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": 
"none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW100,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW100,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..f90fe85714d6422af66201b9b27efae1c52bde6e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW100,is_bottom=True.json @@ -0,0 +1,130 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW100.json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW100.json new file mode 100644 index 
0000000000000000000000000000000000000000..bf06688133de055c7c5210330bc9a0404ace239d --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW100.json @@ -0,0 +1,130 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..dde83aad61ae0072cc1b6d6e287c10efa9de756d --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int4_w4a16,is_bottom=True.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int4_w4a16.json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int4_w4a16.json new file mode 100644 index 0000000000000000000000000000000000000000..4363cec15d1e61612f7a17c38f8b8b2cf7308cc5 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int4_w4a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 8, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..4ed34e5181095ef339c30d085e2d3e03c4510d2e --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + 
"16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", 
+ "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": true, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json new file mode 100644 index 0000000000000000000000000000000000000000..fb01b09cfa9991cd9d7db20c2240bfa96b574473 --- /dev/null +++ 
b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,dtype=int8_w8a8,is_bottom=True,block_shape=[128,128].json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 512, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 2, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "32768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "65536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,is_bottom=True.json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,is_bottom=True.json new file mode 100644 index 0000000000000000000000000000000000000000..fb19dce1a3f59333833c9a2d4d8744f168421ea3 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200,is_bottom=True.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 
1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "llvm-iglp-8", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 2, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 8, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200.json b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200.json new file mode 100644 index 0000000000000000000000000000000000000000..d23179bfd7acfa06c396a0cae50c6866baeeb199 --- /dev/null +++ b/aiter/ops/triton/configs/moe/E=8,N=2048,device_name=BW200.json @@ -0,0 +1,206 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 2, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 2, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 4, + "num_stages": 1 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "none", + "sched_latency": "none", + "kpack": 2, + "num_warps": 4, + "num_stages": 1 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + 
"num_stages": 2 + }, + "32768": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "COMBINE_SCALE_LOAD": false, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "kpack": 1, + "num_warps": 16, + "num_stages": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/MI300X-MOE-DEFAULT.json b/aiter/ops/triton/configs/moe/MI300X-MOE-DEFAULT.json new file mode 100644 index 0000000000000000000000000000000000000000..68146be44aafd3be7c2587ae84559b96a26495d3 --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI300X-MOE-DEFAULT.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "large_M": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/MI300X-MOE-FP8_W8A8.json b/aiter/ops/triton/configs/moe/MI300X-MOE-FP8_W8A8.json new file mode 100644 index 0000000000000000000000000000000000000000..643964ca95b606062180b9500a8ebc13f9618d8a --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI300X-MOE-FP8_W8A8.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + 
"large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/MI300X-MOE-INT4_W4A16.json b/aiter/ops/triton/configs/moe/MI300X-MOE-INT4_W4A16.json new file mode 100644 index 0000000000000000000000000000000000000000..f460ab03fb436af18a1b37deda486d57e596429b --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI300X-MOE-INT4_W4A16.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/MI300X-MOE-INT8_W8A16.json b/aiter/ops/triton/configs/moe/MI300X-MOE-INT8_W8A16.json new file mode 100644 index 0000000000000000000000000000000000000000..ab634ae7d3abaaaafe17a971c134a8250a8a6362 --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI300X-MOE-INT8_W8A16.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/MI300X-MOE-INT8_W8A8.json b/aiter/ops/triton/configs/moe/MI300X-MOE-INT8_W8A8.json new file mode 100644 index 0000000000000000000000000000000000000000..643964ca95b606062180b9500a8ebc13f9618d8a --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI300X-MOE-INT8_W8A8.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/aiter/ops/triton/configs/moe/MI300X-MOE_ROUTING_SIGMOID_TOPK1.json b/aiter/ops/triton/configs/moe/MI300X-MOE_ROUTING_SIGMOID_TOPK1.json new file mode 100644 index 0000000000000000000000000000000000000000..79b2e29247197b89d29bd84b37cfa75f0eb2dd9d --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI300X-MOE_ROUTING_SIGMOID_TOPK1.json @@ -0,0 +1,70 @@ +{ + "N16": { + "small" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "kpack": 1 + }, + "medium" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "kpack": 1 + }, + "large" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "kpack": 2 + }, + "xlarge" :{ + "BLOCK_M": 32, + "BLOCK_K": 128, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "kpack": 2 + } + }, + 
"N128": { + "small" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 0, + "kpack": 1 + }, + "medium" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 0, + "kpack": 2 + }, + "large" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "kpack": 2 + }, + "xlarge" :{ + "BLOCK_M": 32, + "BLOCK_K": 128, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "kpack": 2 + } + } +} diff --git a/aiter/ops/triton/configs/moe/MI350X-MOE-DEFAULT.json b/aiter/ops/triton/configs/moe/MI350X-MOE-DEFAULT.json new file mode 100644 index 0000000000000000000000000000000000000000..e4b2bc7ac9ee32be291cd3838f15d9ea0cc46340 --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI350X-MOE-DEFAULT.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "large_M": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/MI350X-MOE-FP8_W8A8.json b/aiter/ops/triton/configs/moe/MI350X-MOE-FP8_W8A8.json new file mode 100644 index 0000000000000000000000000000000000000000..517d271f0a1ecda5eac5819131036ff172c7177c --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI350X-MOE-FP8_W8A8.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 
1 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/MI350X-MOE-INT4_W4A16.json b/aiter/ops/triton/configs/moe/MI350X-MOE-INT4_W4A16.json new file mode 100644 index 0000000000000000000000000000000000000000..7a5b986e753c33fa188de5a39a90feb2ebdfc1fe --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI350X-MOE-INT4_W4A16.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/MI350X-MOE-INT8_W8A16.json b/aiter/ops/triton/configs/moe/MI350X-MOE-INT8_W8A16.json new file mode 100644 index 0000000000000000000000000000000000000000..7394edb931b9d752f6a0598f719ea141e84c8db1 --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI350X-MOE-INT8_W8A16.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/MI350X-MOE-INT8_W8A8.json b/aiter/ops/triton/configs/moe/MI350X-MOE-INT8_W8A8.json new file mode 100644 index 0000000000000000000000000000000000000000..517d271f0a1ecda5eac5819131036ff172c7177c --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI350X-MOE-INT8_W8A8.json @@ -0,0 +1,35 @@ +{ + "small_M": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "medium_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "large_M": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/aiter/ops/triton/configs/moe/MI350X-MOE_ROUTING_SIGMOID_TOPK1.json b/aiter/ops/triton/configs/moe/MI350X-MOE_ROUTING_SIGMOID_TOPK1.json new file mode 100644 index 0000000000000000000000000000000000000000..79b2e29247197b89d29bd84b37cfa75f0eb2dd9d --- /dev/null +++ b/aiter/ops/triton/configs/moe/MI350X-MOE_ROUTING_SIGMOID_TOPK1.json @@ -0,0 +1,70 @@ +{ + "N16": { + "small" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "kpack": 1 + }, + "medium" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "kpack": 1 + }, + "large" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, 
+ "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 3, + "kpack": 2 + }, + "xlarge" :{ + "BLOCK_M": 32, + "BLOCK_K": 128, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "kpack": 2 + } + }, + "N128": { + "small" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 0, + "kpack": 1 + }, + "medium" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 0, + "kpack": 2 + }, + "large" :{ + "BLOCK_M": 16, + "BLOCK_K": 256, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "kpack": 2 + }, + "xlarge" :{ + "BLOCK_M": 32, + "BLOCK_K": 128, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "kpack": 2 + } + } +} diff --git a/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=128-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=auto.json b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=128-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..6a3130f4e592c40c4940c067420f327e7eecb022 --- /dev/null +++ b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=128-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=auto.json @@ -0,0 +1,44 @@ +{ + "config": { + "1": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE": 64, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + 
"instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + } + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=128-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=fp8.json b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=128-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=fp8.json new file mode 100644 index 0000000000000000000000000000000000000000..6663c0c0429aba53a26519943727cfbab4674c2c --- /dev/null +++ b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=128-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=fp8.json @@ -0,0 +1,44 @@ +{ + "config": { + "1": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE": 64, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE": 128, + "waves_per_eu": 1, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "USE_MATRIX_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE": 64, + "waves_per_eu": 1, + "instruction_sched_variant": "none", + "sched_latency": "mmac5-ds10", + "USE_MATRIX_LOAD": false, + "num_warps": 4, 
+ "num_ctas": 1, + "num_stages": 2 + } + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=32-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=auto.json b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=32-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..dc6ff9010100d977633775235d4963c558a61da8 --- /dev/null +++ b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=32-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=auto.json @@ -0,0 +1,44 @@ +{ + "config": { + "1": { + "BLOCK_SIZE": 16, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "8": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "local-prefetch", + "sched_latency": "mmac5-ds10", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + } + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=32-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=fp8.json 
b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=32-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=fp8.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd02e457263be52dcc44f957f928515265778ca --- /dev/null +++ b/aiter/ops/triton/configs/paged_attention_2d/paged_attention_2d-device=gfx938-CACHE_BLOCK_SIZE=32-HEAD_SIZE_PADDED=128-SLIDING_WINDOW=0-USE_ALIBI_SLOPES=False-HEAD_DIM_PAD_REQ=False-kv_dtype=fp8.json @@ -0,0 +1,44 @@ +{ + "config": { + "1": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "none", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "4": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "none", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "none", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE": 32, + "waves_per_eu": 1, + "instruction_sched_variant": "none", + "sched_latency": "none", + "USE_MATRIX_LOAD": false, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + } + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/sage_attention/_attn_fwd-device=gfx936 b/aiter/ops/triton/configs/sage_attention/_attn_fwd-device=gfx936 new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/aiter/ops/triton/configs/sage_attention/quant_per_block_int8_kernel-device=gfx936 b/aiter/ops/triton/configs/sage_attention/quant_per_block_int8_kernel-device=gfx936 new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx936-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx936-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..6c4ec97c9ecf98c470d5278a67441a2a3df6e400 --- /dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx936-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "8": { + "BLOCK_M": 32, + "TILE_SIZE": 16, + "num_warps": 4, + "num_stages": 1 + } + } +} diff --git a/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx936-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx936-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..7bf1f76d717bd45a8251c6a869b1787632ed0a86 --- /dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx936-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "16": { + "BLOCK_M": 32, + "TILE_SIZE": 16, + "num_warps": 4, + "num_stages": 1 + } + } +} diff --git a/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx938-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx938-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..14af234fb6404b4567350156771ef44710e96603 --- 
/dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx938-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "8": { + "BLOCK_M": 16, + "TILE_SIZE": 16, + "num_warps": 4, + "num_stages": 2 + } + } +} diff --git a/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx938-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx938-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..6c58405fa047f65bcb372684c67730af2a2e2a9e --- /dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_2d-device=gfx938-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-fp8=0-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "16": { + "BLOCK_M": 16, + "TILE_SIZE": 64, + "num_warps": 4, + "num_stages": 1 + } + } +} diff --git a/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx936-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-seg=16-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx936-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-seg=16-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..6b5fbe3ff1951982c5b5000b9c0059bd852c6b2f --- /dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx936-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-seg=16-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "8": { + "BLOCK_M": 16, + "TILE_SIZE": 16, + "num_stages": 2, + "num_warps": 4 + } + } +} diff --git 
a/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx936-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-seg=16-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx936-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-seg=16-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..ae7a3c201c1a59eb4f028ecbd79d250da747992a --- /dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx936-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-seg=16-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "16": { + "BLOCK_M": 16, + "TILE_SIZE": 16, + "num_stages": 1, + "num_warps": 4 + } + } + } diff --git a/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx938-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-seg=16-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx938-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-seg=16-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..3679ddf2bfccef4f5454547235475847bbb9bf0f --- /dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx938-bs=16-hs=192-sw=128-alibi=0-qq=0-softcap=0-sinks=1-mm=0-seg=16-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "8": { + "BLOCK_M": 16, + "TILE_SIZE": 64, + "num_stages": 1, + "num_warps": 4 + } + } + } diff --git a/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx938-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-seg=16-hsp=256-pad=1-kv=auto.json b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx938-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-seg=16-hsp=256-pad=1-kv=auto.json new file mode 100644 index 0000000000000000000000000000000000000000..5da3548e88b5319b0d54815861e763364e5b2237 --- 
/dev/null +++ b/aiter/ops/triton/configs/unified_attention/unified_attention_3d-device=gfx938-bs=32-hs=192-sw=0-alibi=0-qq=0-softcap=0-sinks=0-mm=0-seg=16-hsp=256-pad=1-kv=auto.json @@ -0,0 +1,10 @@ +{ + "config": { + "16": { + "BLOCK_M": 16, + "TILE_SIZE": 64, + "num_stages": 1, + "num_warps": 4 + } + } + } diff --git a/aiter/ops/triton/extend_attention.py b/aiter/ops/triton/extend_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..9a8e057f29e872cc55b48c5524908deb926a7cec --- /dev/null +++ b/aiter/ops/triton/extend_attention.py @@ -0,0 +1,526 @@ +# Copyright (C) 2023-2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +""" +Memory-efficient attention for prefill. +It supports page size = 1 and prefill with KV cache (i.e. extend). 
+""" + +from typing import Optional +import functools +import json +import torch +import triton +import triton.language as tl + +import os +from triton.knobs import cache as cache_knob + +from aiter.ops.triton.prefill_attention import context_attention_fwd +from aiter.ops.triton.activation import _tanh +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH + + +@triton.jit +def _fwd_kernel( + Q_Extend, + K_Extend, + V_Extend, + O_Extend, + K_Buffer, + V_Buffer, + qo_indptr, + kv_indptr, + kv_indices, + mask_ptr, + mask_indptr, + sm_scale, + kv_group_num, + stride_qbs, + stride_qh, + stride_kbs, + stride_kh, + stride_vbs, + stride_vh, + stride_obs, + stride_oh, + stride_buf_kbs, + stride_buf_kh, + stride_buf_vbs, + stride_buf_vh, + logit_cap: tl.constexpr, + Lq: tl.constexpr, + Lv: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DPE: tl.constexpr, + BLOCK_DV: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + USE_CUSTOM_MASK: tl.constexpr, + IS_CAUSAL: tl.constexpr, + SKIP_PREFIX_CUSTOM_MASK: tl.constexpr, + STORE_TRANSPOSE: tl.constexpr, +): + cur_seq = tl.program_id(0) + cur_head = tl.program_id(1) + cur_block_m = tl.program_id(2) + cur_kv_head = cur_head // kv_group_num + + cur_seq_extend_start_idx = tl.load(qo_indptr + cur_seq) + cur_seq_len_extend = tl.load(qo_indptr + cur_seq + 1) - cur_seq_extend_start_idx + cur_seq_kv_start_idx = tl.load(kv_indptr + cur_seq) + cur_seq_len_prefix = tl.load(kv_indptr + cur_seq + 1) - cur_seq_kv_start_idx + cur_seq_len = cur_seq_len_prefix + cur_seq_len_extend + + if USE_CUSTOM_MASK: + cur_seq_mask_start_idx = tl.load(mask_indptr + cur_seq) + + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dv = tl.arange(0, BLOCK_DV) + offs_m = tl.arange(0, BLOCK_M) + mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend + + mask_d = offs_d < Lq + mask_dv = offs_dv < Lv + + offs_q = ( + (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None]) 
+ * stride_qbs + + cur_head * stride_qh + + offs_d[None, :] + ) + q = tl.load( + Q_Extend + offs_q, mask=(mask_m[:, None]) & (mask_d[None, :]), other=0.0 + ) + + if BLOCK_DPE > 0: + offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE) + offs_qpe = ( + (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None]) + * stride_qbs + + cur_head * stride_qh + + offs_dpe[None, :] + ) + qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0) + + # stage 1: compute scores with prefix + offs_n = tl.arange(0, BLOCK_N) + + acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) + deno = tl.zeros([BLOCK_M], dtype=tl.float32) + e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + + for start_n in range(0, cur_seq_len_prefix, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + mask_n = (start_n + offs_n) < cur_seq_len_prefix + + offs_kv_loc = tl.load( + kv_indices + cur_seq_kv_start_idx + start_n + offs_n, mask=mask_n, other=0 + ) + + # load k in transposed way + offs_buf_k = ( + offs_kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_d[:, None] + ) + k = tl.load( + K_Buffer + offs_buf_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0 + ) + + qk = tl.dot(q.to(k.dtype), k) + if BLOCK_DPE > 0: + offs_kpe = ( + offs_kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_dpe[:, None] + ) + kpe = tl.load( + K_Buffer + offs_kpe, + mask=mask_n[None, :], + other=0.0, + ) + qk += tl.dot(qpe.to(kpe.dtype), kpe) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * _tanh(qk / logit_cap) + + if USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK: + custom_mask = tl.load( + mask_ptr + + cur_seq_mask_start_idx + + (cur_block_m * BLOCK_M + offs_m[:, None]) * cur_seq_len + + start_n + + offs_n[None, :], + mask=(mask_m[:, None] & mask_n[None, :]), + other=0, + ) + custom_mask &= mask_m[:, None] & mask_n[None, :] + qk = tl.where(custom_mask, qk, float("-inf")) + else: + qk = tl.where(mask_m[:, None] & mask_n[None, :], 
qk, float("-inf")) + + n_e_max = tl.maximum(tl.max(qk, 1), e_max) + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max[:, None]) + deno = deno * re_scale + tl.sum(p, 1) + + offs_buf_v = ( + offs_kv_loc[:, None] * stride_buf_vbs + + cur_kv_head * stride_buf_vh + + offs_dv[None, :] + ) + v = tl.load( + V_Buffer + offs_buf_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0 + ) + p = p.to(v.dtype) + acc = acc * re_scale[:, None] + tl.dot(p, v) + + e_max = n_e_max + + # stage 2: compute the triangle part + + cur_block_m_end = ( + cur_seq_len_extend + if not IS_CAUSAL + else tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M) + ) + for start_n in range(0, cur_block_m_end, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + mask_n = (start_n + offs_n) < cur_block_m_end + + # load k in transposed way + offs_k = ( + (cur_seq_extend_start_idx + start_n + offs_n[None, :]) * stride_kbs + + cur_kv_head * stride_kh + + offs_d[:, None] + ) + k = tl.load( + K_Extend + offs_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0 + ) + + qk = tl.dot(q.to(k.dtype), k, out_dtype=tl.float32) + if BLOCK_DPE > 0: + offs_kpe = ( + (cur_seq_extend_start_idx + start_n + offs_n[None, :]) * stride_kbs + + cur_kv_head * stride_kh + + offs_dpe[:, None] + ) + kpe = tl.load( + K_Extend + offs_kpe, + mask=mask_n[None, :], + other=0.0, + ) + qk += tl.dot(qpe.to(kpe.dtype), kpe) + + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * _tanh(qk / logit_cap) + + if USE_CUSTOM_MASK: + custom_mask = tl.load( + mask_ptr + + cur_seq_mask_start_idx + + (cur_block_m * BLOCK_M + offs_m[:, None]) * cur_seq_len + + cur_seq_len_prefix + + start_n + + offs_n[None, :], + mask=(mask_m[:, None] & mask_n[None, :]), + other=0, + ) + custom_mask &= mask_m[:, None] & mask_n[None, :] + qk = tl.where(custom_mask, qk, float("-inf")) + elif IS_CAUSAL: + mask_causual = (cur_block_m * BLOCK_M + offs_m[:, None]) >= ( + start_n + offs_n[None, :] + ) + mask_causual &= mask_m[:, None] & mask_n[None, 
def create_tuple(k):
    """Turn a stringified tuple key back into a real tuple.

    JSON object keys are always strings, so tuple-valued config keys are
    stored in their ``repr`` form, e.g. ``"(8, 576, 512, False, True)"``.
    This helper reverses that encoding.

    Args:
        k: A config key. Anything that is not a string fully wrapped in
            parentheses is returned unchanged.

    Returns:
        A tuple of the parsed literals when ``k`` looks like ``"(...)"``,
        otherwise ``k`` itself. A parenthesised scalar such as ``"(8)"`` is
        returned as the one-element tuple ``(8,)``.

    Raises:
        ValueError: If ``k`` is wrapped in parentheses but its content is not
            a valid Python literal.
    """
    # Local import keeps this helper self-contained.
    import ast

    # Both delimiters must be present; a key with only one of them is an
    # ordinary string and must not be chopped. This also covers "" safely.
    if not isinstance(k, str) or len(k) < 2 or k[0] != "(" or k[-1] != ")":
        return k

    # literal_eval only accepts literals (numbers, strings, booleans, None,
    # nested containers), so text coming from a config file can never execute
    # code. It also handles "()", trailing commas and quoted commas correctly.
    try:
        parsed = ast.literal_eval(k)
    except (ValueError, SyntaxError) as err:
        raise ValueError(f"malformed tuple config key: {k!r}") from err

    return parsed if isinstance(parsed, tuple) else (parsed,)
def extend_attention_fwd(
    q_extend,
    k_extend,
    v_extend,
    o_extend,
    k_buffer,
    v_buffer,
    qo_indptr,
    kv_indptr,
    kv_indices,
    custom_mask,
    is_causal,
    mask_indptr,
    max_len_extend,
    sm_scale=None,
    logit_cap=0.0,
    skip_prefix_custom_mask=True,
    config: Optional[dict[str, object]] = None,
):
    """Launch the extend-attention kernel and write the result into ``o_extend``.

    q_extend, k_extend, v_extend, o_extend: contiguous tensors

    k_buffer, v_buffer: (prefix + extend) tensors in mem_manager

    Args:
        q_extend: Queries of the extend tokens. ``shape[1]`` is read as the
            number of query heads and ``shape[-1]`` as the query head dim.
        k_extend: Keys of the extend tokens. ``shape[1]`` is read as the
            number of KV heads.
        v_extend: Values of the extend tokens. ``shape[-1]`` is the value
            head dim.
        o_extend: Output tensor, written in place by the kernel.
        k_buffer: Cached keys, addressed through ``kv_indices``.
        v_buffer: Cached values, addressed through ``kv_indices``.
        qo_indptr: Per-sequence offsets into the extend tokens; its length
            minus one is used as the batch size.
        kv_indptr: Per-sequence offsets into ``kv_indices``.
        kv_indices: Cache row indices of the prefix tokens.
        custom_mask: Optional attention mask. ``None`` disables custom
            masking.
        is_causal: Forwarded to the kernel as ``IS_CAUSAL``.
        mask_indptr: Per-sequence offsets into ``custom_mask``. Replaced by a
            dummy tensor when ``custom_mask`` is ``None``.
        max_len_extend: Used with ``config["BLOCK_M"]`` to size the third
            grid dimension.
        sm_scale: Score scale. Any falsy value (``None`` or ``0``) selects
            ``1 / sqrt(Lq)``.
        logit_cap: Forwarded to the kernel, which applies tanh capping when
            it is greater than zero.
        skip_prefix_custom_mask: Forwarded as ``SKIP_PREFIX_CUSTOM_MASK``.
        config: Optional launch configuration. It must contain ``BLOCK_M``
            and is forwarded to the kernel as keyword arguments. When
            omitted, a tuned config is looked up for fp16 inputs and
            ``default_config`` is used otherwise.
    """
    Lq = q_extend.shape[-1]
    Lv = v_extend.shape[-1]

    # Some head dims are split into a power-of-two main part plus a tail that
    # the kernel handles through its BLOCK_DPE path.
    # NOTE(review): presumably the tail is a positional-embedding slice of an
    # MLA-style layout - confirm against the model definitions.
    if Lq == 576:
        BLOCK_DMODEL = 512
        BLOCK_DPE = 64
    elif Lq == 288:
        BLOCK_DMODEL = 256
        BLOCK_DPE = 32
    elif Lq == 192:
        BLOCK_DMODEL = 128
        BLOCK_DPE = 64
    else:
        BLOCK_DMODEL = triton.next_power_of_2(Lq)
        BLOCK_DPE = 0
    BLOCK_DV = triton.next_power_of_2(Lv)

    # Kept as `or` (not `is None`) so existing callers passing 0 still get the
    # default scale.
    sm_scale = sm_scale or 1.0 / (Lq**0.5)
    batch_size, head_num = qo_indptr.shape[0] - 1, q_extend.shape[1]
    kv_group_num = q_extend.shape[1] // k_extend.shape[1]

    USE_CUSTOM_MASK = custom_mask is not None
    # Skip custom mask for prefix part
    SKIP_PREFIX_CUSTOM_MASK = skip_prefix_custom_mask

    if not USE_CUSTOM_MASK:
        # The kernel signature still needs tensor arguments; pass dummies.
        custom_mask = torch.tensor([0], dtype=torch.bool, device=q_extend.device)
        mask_indptr = torch.tensor([0], dtype=torch.int32, device=q_extend.device)

    # `path` names a pre-compiled kernel that belongs to a tuned config. It
    # must be bound on every branch: previously a caller-supplied `config`
    # left it undefined and the launch below raised UnboundLocalError.
    path = None
    if config is None:
        if q_extend.dtype == torch.float16:
            config, path = _get_config(
                kv_group_num, Lq, Lv, USE_CUSTOM_MASK, is_causal
            )
        else:
            # Tuned configs only exist for fp16 inputs.
            config = default_config

    grid = (batch_size, head_num, triton.cdiv(max_len_extend, config["BLOCK_M"]))

    # Use the saved kernel when one is present in the Triton cache directory,
    # otherwise JIT-compile as usual.
    if has_kernel_cache(path):
        fn = functools.partial(
            triton.utils.run_saved_kernel, _fwd_kernel, path, grid=grid
        )
    else:
        fn = _fwd_kernel[grid]

    fn(
        q_extend,
        k_extend,
        v_extend,
        o_extend,
        k_buffer,
        v_buffer,
        qo_indptr,
        kv_indptr,
        kv_indices,
        custom_mask,
        mask_indptr,
        sm_scale,
        kv_group_num,
        q_extend.stride(0),
        q_extend.stride(1),
        k_extend.stride(0),
        k_extend.stride(1),
        v_extend.stride(0),
        v_extend.stride(1),
        o_extend.stride(0),
        o_extend.stride(1),
        k_buffer.stride(0),
        k_buffer.stride(1),
        v_buffer.stride(0),
        v_buffer.stride(1),
        logit_cap=logit_cap,
        BLOCK_DMODEL=BLOCK_DMODEL,
        BLOCK_DPE=BLOCK_DPE,
        BLOCK_DV=BLOCK_DV,
        Lq=Lq,
        Lv=Lv,
        USE_CUSTOM_MASK=USE_CUSTOM_MASK,
        IS_CAUSAL=is_causal,
        SKIP_PREFIX_CUSTOM_MASK=SKIP_PREFIX_CUSTOM_MASK,
        STORE_TRANSPOSE=True,
        # BLOCK_M, BLOCK_N, num_warps, num_stages and the AMD-specific knobs
        # all come from the selected config.
        **config,
    )
- b_seq_len_prefix[i] + pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i] + q_buffer[pl:pr] = q_extend[pt : pt + cur_seq_len_extend] + pt += cur_seq_len_extend + + o_buffer = torch.empty_like(q_buffer) + context_attention_fwd( + q_buffer, k_buffer, v_buffer, o_buffer, b_start_loc, b_seq_len, max_len_in_batch + ) + + pt = 0 + for i in range(B): + cur_seq_len_extend = b_seq_len[i] - b_seq_len_prefix[i] + pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i] + o_extend[pt : pt + cur_seq_len_extend] = o_buffer[pl:pr] + pt += cur_seq_len_extend diff --git a/aiter/ops/triton/flash_attention_forward.py b/aiter/ops/triton/flash_attention_forward.py new file mode 100644 index 0000000000000000000000000000000000000000..5bf922326c2b98987d93a7b17c3f048ff454fcfb --- /dev/null +++ b/aiter/ops/triton/flash_attention_forward.py @@ -0,0 +1,1182 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Fused Attention +=============== + +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao +(https://tridao.me/publications/flash2/flash2.pdf) + +Features supported: + +1) Fwd with causal masking +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. 
@triton.jit
def _attn_fwd_inner(
    acc,
    l_i,
    m_i,
    q,
    K,
    V,
    k_offset,
    v_offset,
    start_m,
    actual_seqlen_k,
    stride_kk,
    stride_kn,
    stride_vk,
    stride_vn,
    dropout_p,
    philox_seed,
    batch_philox_offset,
    encoded_softmax_block_ptr,
    block_min,
    block_max,
    offs_n_causal,
    masked_blocks,
    bias_ptr,
    n_extra_tokens,
    IS_CAUSAL: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    OFFS_M: tl.constexpr,
    OFFS_N: tl.constexpr,
    PRE_LOAD_V: tl.constexpr,
    MASK_STEPS: tl.constexpr,
    ENABLE_DROPOUT: tl.constexpr,
    RETURN_ENCODED_SOFTMAX: tl.constexpr,
    PADDED_HEAD: tl.constexpr,
    USE_FP8: tl.constexpr,
    USE_MLS: tl.constexpr,
    ACTUAL_BLOCK_DMODEL: tl.constexpr,
    qk_scale,
    p_descale,
    n_full_blocks,
):
    """Accumulate attention over the key/value blocks in [block_min, block_max).

    For each ``BLOCK_N``-wide step the function loads a K tile, forms the
    scores ``q @ k``, applies the optional padding / causal masks and bias,
    and folds the tile into the running online-softmax state using base-2
    exponentials.

    K and V tiles are read either through block pointers (``USE_MLS`` false)
    or through ``tl.matrix_load`` at element offset ``start_n`` (``USE_MLS``
    true).

    Returns:
        The updated ``(acc, l_i, m_i)``: output accumulator, running row sum
        and running row maximum.

    ``masked_blocks`` is accepted for interface compatibility and is not read
    here.
    """
    if not USE_MLS:
        K_block_ptr = tl.make_block_ptr(
            base=K + k_offset,
            shape=(ACTUAL_BLOCK_DMODEL, actual_seqlen_k),
            strides=(stride_kk, stride_kn),
            offsets=(0, 0),
            block_shape=(BLOCK_DMODEL, BLOCK_N),
            order=(0, 1),
        )
        V_block_ptr = tl.make_block_ptr(
            base=V + v_offset,
            shape=(actual_seqlen_k, ACTUAL_BLOCK_DMODEL),
            strides=(stride_vk, stride_vn),
            offsets=(0, 0),
            block_shape=(BLOCK_N, BLOCK_DMODEL),
            order=(1, 0),
        )
        if MASK_STEPS:
            # The masked pass starts after the blocks already handled by the
            # unmasked pass.
            K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N))
            V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0))

    # loop over k, v, and update accumulator
    for start_n in range(block_min, block_max, BLOCK_N):
        # For padded blocks, we will overrun the tensor size if
        # we load all BLOCK_N. For others, the blocks are all within range.
        if USE_MLS:
            # NOTE(review): no explicit bounds mask is applied to this tile;
            # whether tl.matrix_load pads out-of-range elements is not visible
            # here - confirm. The previous, unused mask computed the key
            # position as `start_n * BLOCK_N + arange`, but `start_n` is
            # already an element offset (see `offsets` below), so any future
            # mask must use `start_n + tl.arange(0, BLOCK_N)`.
            k = tl.matrix_load(K + k_offset,
                               shape=[ACTUAL_BLOCK_DMODEL, actual_seqlen_k],
                               strides=[stride_kk, stride_kn],
                               block_shape=[BLOCK_DMODEL, BLOCK_N],
                               offsets=[0, start_n],
                               )
        else:
            k = load_fn(
                K_block_ptr,
                PADDED_HEAD,
                MASK_STEPS and (n_extra_tokens != 0),
                "zero",
            )
        if PRE_LOAD_V:
            if USE_MLS:
                # Same bounds caveat as for the K tile above.
                v = tl.matrix_load(V + v_offset,
                                   shape=[actual_seqlen_k, ACTUAL_BLOCK_DMODEL],
                                   strides=[stride_vk, stride_vn],
                                   block_shape=[BLOCK_N, BLOCK_DMODEL],
                                   offsets=[start_n, 0],
                                   )
            else:
                v = load_fn(
                    V_block_ptr,
                    MASK_STEPS and (n_extra_tokens != 0),
                    PADDED_HEAD,
                    "zero",
                )
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        # We start from end of seqlen_k so only the first iteration would need
        # to be checked for padding if it is not a multiple of block_n
        # TODO: This can be optimized to only be true for the padded block.
        if MASK_STEPS:  # noqa: SIM102
            # If this is the last block / iteration, we want to
            # mask if the sequence length is not a multiple of block size
            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps
            # if not is_modulo_mn. last step might get wasted but that is okay.
            # check if this masking works for that case.
            if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
                boundary_m = tl.full([BLOCK_M],
                                     actual_seqlen_k,
                                     dtype=tl.int32)
                size_n = start_n + OFFS_N[None, :]
                mask = size_n < boundary_m[:, None]
                qk = tl.where(mask, qk, float("-inf"))
        if IS_CAUSAL:
            causal_boundary = start_n + offs_n_causal
            causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
            qk = tl.where(causal_mask, qk, float("-inf"))
        # -- compute qk ----
        # Masked positions already hold -inf, so adding the dot product keeps
        # them at -inf.
        qk += tl.dot(q, k)
        if USE_FP8:
            qk *= qk_scale
        if bias_ptr is not None:
            bias = load_fn(bias_ptr, False, MASK_STEPS
                           and (n_extra_tokens != 0), "zero")
            # While bias is added after multiplying qk with sm_scale, our
            # optimization to use 2^x instead of e^x results in an additional
            # scale factor of log2(e) which we must also multiply the bias with.
            qk += bias * 1.44269504089
        m_ij = tl.maximum(m_i, tl.max(qk, 1))
        qk = qk - m_ij[:, None]
        p = tl.math.exp2(qk)

        # CAVEAT: Must update l_ij before applying dropout
        l_ij = tl.sum(p, 1)
        if ENABLE_DROPOUT:
            philox_offset = (batch_philox_offset +
                             start_m * BLOCK_M * actual_seqlen_k + start_n -
                             BLOCK_N)
            keep = dropout_mask(
                philox_seed,
                philox_offset,
                dropout_p,
                BLOCK_M,
                BLOCK_N,
                actual_seqlen_k,
            )
            if RETURN_ENCODED_SOFTMAX:
                # Dropped entries are stored negated so the sign encodes the
                # dropout decision.
                tl.store(
                    encoded_softmax_block_ptr,
                    tl.where(keep, p,
                             -p).to(encoded_softmax_block_ptr.type.element_ty),
                )
            p = tl.where(keep, p, 0.0)
        elif RETURN_ENCODED_SOFTMAX:
            tl.store(
                encoded_softmax_block_ptr,
                p.to(encoded_softmax_block_ptr.type.element_ty),
            )
        # -- update output accumulator --
        alpha = tl.math.exp2(m_i - m_ij)
        acc = acc * alpha[:, None]
        if not PRE_LOAD_V:
            if USE_MLS:
                # Same bounds caveat as for the K tile above.
                v = tl.matrix_load(V + v_offset,
                                   shape=[actual_seqlen_k, ACTUAL_BLOCK_DMODEL],
                                   strides=[stride_vk, stride_vn],
                                   block_shape=[BLOCK_N, BLOCK_DMODEL],
                                   offsets=[start_n, 0],
                                   )
            else:
                v = load_fn(
                    V_block_ptr,
                    MASK_STEPS and (n_extra_tokens != 0),
                    PADDED_HEAD,
                    "zero",
                )
        # -- update m_i and l_i
        l_i = l_i * alpha + l_ij
        m_i = m_ij

        if USE_FP8:
            p *= p_descale

        acc += tl.dot(p.to(v.type.element_ty), v)

        # Block pointers are stateful and must be stepped; the matrix_load
        # path addresses tiles directly through start_n.
        if not USE_MLS:
            V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
            K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
        if bias_ptr is not None:
            bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N))
        if RETURN_ENCODED_SOFTMAX:
            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
                                                   (0, BLOCK_N))
    return acc, l_i, m_i
False, + 'USE_MLS': False + }, + num_stages=1, + num_warps=8), + # TODO: This config fails with head_size not pow2 with data mismatches. + # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, + # 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + + # Fails in AccelerateAMDMatmul (Triton) assert when using FP8: + # triton.Config( + # { + # "BLOCK_M": 16, + # "BLOCK_N": 16, + # "waves_per_eu": 1, + # "PRE_LOAD_V": False, + # }, + # num_stages=1, + # num_warps=4, + # ), + ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8'] + +def get_rdna_autotune_configs(): + return [ + triton.Config( + { + 'BLOCK_M': 32, + 'BLOCK_N': 32, + 'waves_per_eu': 4, + 'PRE_LOAD_V': False, + 'USE_MLS': True, + }, + num_stages=1, + num_warps=2), + triton.Config( + { + 'BLOCK_M': 32, + 'BLOCK_N': 32, + 'waves_per_eu': 2, + 'PRE_LOAD_V': False, + 'USE_MLS': True, + }, + num_stages=1, + num_warps=2), + triton.Config( + { + 'BLOCK_M': 32, + 'BLOCK_N': 16, + 'waves_per_eu': 4, + 'PRE_LOAD_V': False, + 'USE_MLS': True, + }, + num_stages=1, + num_warps=2), + triton.Config( + { + 'BLOCK_M': 32, + 'BLOCK_N': 16, + 'waves_per_eu': 2, + 'PRE_LOAD_V': False, + 'USE_MLS': True, + }, + num_stages=1, + num_warps=2), + # Fails in AccelerateAMDMatmul (Triton) assert when using FP8: + # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 4, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 2, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + # # Fall-back config. 
+ # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 1, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8', "USE_MLS"] + + +def get_autotune_configs(): + #if on_gfx1x(): + # return get_rdna_autotune_configs() + #else: + return get_cdna_autotune_configs() + + +autotune_configs, autotune_keys = get_autotune_configs() +float8_info = torch.finfo(torch.float8_e4m3fn) + +def prune_configs(configs, nargs, **kwargs): + def _prune(config): + _config = config.all_kwargs() + if _config["USE_MLS"] and kwargs["USE_FP8"] and kwargs["BLOCK_DMODEL"] == 32: # unsupported by mls + return True + if _config["USE_MLS"] and ( + (kwargs["USE_FP8"] and kwargs["ACTUAL_BLOCK_DMODEL"] % 4 != 0) or + (not kwargs["USE_FP8"] and kwargs["ACTUAL_BLOCK_DMODEL"] % 2 != 0) + ): # mls instr start address must be aligned to 4 bytes (1 DWORD) + return True + return False + return [c for c in configs if not _prune(c)] + +''' +@triton.autotune( + configs=autotune_configs, + key=autotune_keys, + prune_configs_by={"early_config_prune": prune_configs} +) +''' +''' +@triton.utils.hcutune( + configs=autotune_configs, + always_tuning=True, + key=['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8', 'USE_MLS'] +) +''' +@triton.heuristics( + values={ + "PADDED_HEAD": lambda args: args["ACTUAL_BLOCK_DMODEL"] != args["BLOCK_DMODEL"], + } +) +@triton.jit +def attn_fwd( + Q, + K, + V, + bias, + sm_scale, + q_scale, + k_scale, + v_scale, + p_scale, + p_descale, + o_descale, + L, + Out, + stride_qz: tl.int64, + stride_qh: tl.int64, + stride_qm: tl.int64, + stride_qk: tl.int64, + stride_kz: tl.int64, + stride_kh: tl.int64, + stride_kn: tl.int64, + stride_kk: tl.int64, + stride_vz: tl.int64, + stride_vh: tl.int64, + stride_vk: tl.int64, + stride_vn: tl.int64, + stride_oz: tl.int64, + stride_oh: tl.int64, + stride_om: tl.int64, + stride_on: tl.int64, + stride_bz: tl.int64, + stride_bh: tl.int64, + stride_bm: tl.int64, 
+ stride_bn: tl.int64, + cu_seqlens_q, + cu_seqlens_k, + dropout_p, + philox_seed, + philox_offset_base, + encoded_softmax, + HQ: tl.constexpr, + HK: tl.constexpr, + ACTUAL_BLOCK_DMODEL: tl.constexpr, + MAX_SEQLENS_Q: tl.constexpr, + MAX_SEQLENS_K: tl.constexpr, + VARLEN: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + USE_FP8: tl.constexpr, + USE_FP8_OUT: tl.constexpr, + BLOCK_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + USE_MLS: tl.constexpr, + PADDED_HEAD: tl.constexpr, + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, +): + tl.assume(stride_qz > 0) + tl.assume(stride_qh > 0) + tl.assume(stride_qm > 0) + tl.assume(stride_qk > 0) + tl.assume(stride_kz > 0) + tl.assume(stride_kh > 0) + tl.assume(stride_kn > 0) + tl.assume(stride_kk > 0) + tl.assume(stride_vz > 0) + tl.assume(stride_vh > 0) + tl.assume(stride_vk > 0) + tl.assume(stride_vn > 0) + tl.assume(stride_oz > 0) + tl.assume(stride_oh > 0) + tl.assume(stride_om > 0) + tl.assume(stride_on > 0) + tl.assume(stride_bz > 0) + tl.assume(stride_bh > 0) + tl.assume(stride_bm > 0) + tl.assume(stride_bn > 0) + + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + if VARLEN: + cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) + cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) + seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. 
+ if start_m * BLOCK_M > seqlen_q: + return + cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) + cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) + seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start + else: + cu_seqlens_q_start = 0 + cu_seqlens_k_start = 0 + seqlen_q = MAX_SEQLENS_Q + seqlen_k = MAX_SEQLENS_K + + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if IS_CAUSAL: + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn + # matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is + # part of the blocks that are all 0. We exit early. 
+ if n_blocks <= 0: + o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om + + off_h_q * stride_oh) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + # tl.store(O_block_ptr, + # acc.to(Out.type.element_ty), boundary_check=(0,1)) + # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + # + offs_m + # We store inf to LSE, not -inf because in the bwd pass, + # we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 + # for these masked blocks. + # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + # tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here? + return + + # If MQA / GQA, set the K and V head offsets appropriately. + GROUP_SIZE: tl.constexpr = HQ // HK + off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q + + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + n_extra_tokens = seqlen_k % BLOCK_N + + # Compute pointers for all the tensors used in this kernel. 
+ q_offset = (off_z * stride_qz + off_h_q * stride_qh + + cu_seqlens_q_start * stride_qm) + if not USE_MLS: + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + k_offset = (off_z * stride_kz + off_h_k * stride_kh + + cu_seqlens_k_start * stride_kn) + v_offset = (off_z * stride_vz + off_h_k * stride_vh + + cu_seqlens_k_start * stride_vk) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = philox_offset_base \ + + (off_z * HQ + off_h_q) \ + * seqlen_q * seqlen_k + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any dropout. + # In this case, we return an invalid pointer so indicate the mask is not i + # valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. + qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. 
+ if USE_MLS: + q_mls = tl.matrix_load(Q + q_offset, + shape=[seqlen_q, ACTUAL_BLOCK_DMODEL], + strides=[stride_qm, stride_qk], + block_shape=[BLOCK_M, BLOCK_DMODEL], + offsets=[start_m * BLOCK_M, 0], + #boundary_check=(0, 1) if PADDED_HEAD else (0,), + #boundary_check=(0, 1) + ) + + qmask_m = tl.arange(0, BLOCK_M) < BLOCK_M + qmask_d = tl.arange(0, BLOCK_DMODEL) < BLOCK_DMODEL + qmask = qmask_m[:, None] & qmask_d[None, :] + q = tl.where(qmask, q_mls, float(0)) + + #q = q_mls + else: + q = load_fn(Q_block_ptr, True, PADDED_HEAD, "zero") + if not USE_FP8: + q = (q * qk_scale).to(Q.type.element_ty) + acc_scale = 1.0 + else: + qk_scale *= q_scale * k_scale + acc_scale = p_scale * v_scale + + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional + # block. In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its + # value because there is no masking. Similarly we do not need padding. 
+ if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K, + V, + k_offset, + v_offset, + start_m, + seqlen_k, + stride_kk, + stride_kn, + stride_vk, + stride_vn, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, _ + block_min, + block_max, + 0, + 0, + bias_ptr, + # N_EXTRA_TOKENS, + 0, + # IS_CAUSAL, .... + False, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, + False, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + USE_FP8, + USE_MLS, + ACTUAL_BLOCK_DMODEL, + qk_scale, + p_descale, + n_full_blocks + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if masked_blocks > 0: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0 + #K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N)) + #V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, + (0, n_full_blocks)) + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K, + V, + k_offset, + v_offset, + start_m, + seqlen_k, + stride_kk, + stride_kn, + stride_vk, + stride_vn, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + bias_ptr, + n_extra_tokens, + IS_CAUSAL, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... 
+ PRE_LOAD_V, + True, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + USE_FP8, + USE_MLS, + ACTUAL_BLOCK_DMODEL, + qk_scale, + p_descale, + n_full_blocks + ) + # epilogue + + if USE_FP8: + acc *= acc_scale + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. + end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + if USE_FP8_OUT: + acc *= o_descale + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: # noqa: SIM102 + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full((BLOCK_DMODEL, ), + causal_start_idx, + dtype=tl.int32) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) + z = tl.zeros((1, ), tl.float32) + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last + # few rows. This is only true for the last M block. For others, + # overflow_size will be -ve + # overflow_size = end_m_idx - seqlen_q + # if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. 
def check_args(
    q,
    k,
    v,
    o,
    varlen=True,
    max_seqlens=None,
    cu_seqlens_q=None,
    cu_seqlens_k=None,
):
    """Validate the tensors handed to the Triton attention forward pass.

    Args:
        q, k, v: Query, key and value tensors.  They must share rank and
            dtype, ``k`` and ``v`` must have the same shape, and all three
            must agree on the size of the last dimension (at most 256).
        o: Output tensor; must have the same shape as ``q``.
        varlen: If true the inputs must be rank 3 (unpacked here as
            ``(tokens, heads, head_size)``) and both ``cu_seqlens_*`` must
            be given with equal length.  Otherwise the inputs must be rank 4
            (unpacked as ``(batch, heads, seqlen, head_size)``) and
            ``max_seqlens`` must be a positive number.
        max_seqlens: Only checked when ``varlen`` is false.
        cu_seqlens_q, cu_seqlens_k: Only checked when ``varlen`` is true.

    Raises:
        AssertionError: If any of the requirements above is violated.
    """
    assert q.dim() == k.dim() and q.dim() == v.dim()
    if varlen:
        assert q.dim() == 3
        _, nheads_q, head_size = q.shape
        _, nheads_k, _ = k.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert len(cu_seqlens_q) == len(cu_seqlens_k)
    else:
        assert q.dim() == 4
        _, nheads_q, _, head_size = q.shape
        _, nheads_k, _, _ = k.shape
        # A missing max_seqlens must fail like every other check instead of
        # surfacing as a TypeError from ``None > 0``.
        assert max_seqlens is not None and max_seqlens > 0
    assert k.shape == v.shape
    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
    # TODO: Change assert if we support qkl f8 and v f16
    assert q.dtype == k.dtype and q.dtype == v.dtype
    assert head_size <= 256
    assert o.shape == q.shape
    assert (nheads_q % nheads_k) == 0


# Head sizes the kernel handles without padding, in ascending order.  This is
# a tuple on purpose: the lookup below relies on iteration order, which a set
# does not guarantee.
_UNPADDED_HEAD_DIMS = (32, 64, 128, 256)


def _padded_head_dim(head_size):
    """Return the smallest entry of ``_UNPADDED_HEAD_DIMS`` >= ``head_size``."""
    padded = next((d for d in _UNPADDED_HEAD_DIMS if d >= head_size), None)
    assert padded is not None
    return padded


def _select_launch_config(use_fp8, padded_d_model):
    """Return the fixed kernel meta-parameters for one launch.

    Heads padded beyond 128 use the same 32x32 tile with or without FP8;
    smaller heads use a 128x128 tile for FP8 and 128x64 otherwise.
    """
    config = {
        "PRE_LOAD_V": False,
        "USE_MLS": False,
        "num_ctas": 1,
        "num_stages": 1,
    }
    if padded_d_model > 128:
        config.update(BLOCK_M=32, BLOCK_N=32, waves_per_eu=4, num_warps=8)
    elif use_fp8:
        config.update(BLOCK_M=128, BLOCK_N=128, waves_per_eu=2, num_warps=4)
    else:
        config.update(BLOCK_M=128, BLOCK_N=64, waves_per_eu=3, num_warps=4)
    return config


def _to_float8(t, scale):
    """Quantize ``t`` to float8_e4m3fn using ``scale``; fp8 input is returned as is."""
    float8 = torch.float8_e4m3fn
    if t.dtype == float8:
        return t
    descale = 1.0 / scale
    # Clamp to the representable fp8 range before the cast.
    scaled = (t * descale).clamp(min=float8_info.min, max=float8_info.max)
    return scaled.to(float8)


class _attention(torch.autograd.Function):
    """Autograd entry point that launches the ``attn_fwd`` Triton kernel.

    Only ``forward`` is defined here, and it always runs the variable-length
    path (``VARLEN=True``) with dropout and encoded-softmax output disabled.
    """

    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        o,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlens_q,
        max_seqlens_k,
        causal=False,
        sm_scale=1.0,
        bias=None,
        fp8_scales=None,
        fp8_out_scale=None,
    ):
        """Run the attention forward kernel.

        Args:
            ctx: Autograd context; launch metadata is stored on it.
            q, k, v: Rank-3 tensors, unpacked as ``(tokens, heads, head_size)``.
            o: Optional output tensor; allocated like ``q`` (with ``v``'s
                dtype) when ``None``.
            cu_seqlens_q, cu_seqlens_k: Cumulative sequence offsets; the
                batch size is ``len(cu_seqlens_q) - 1``.
            max_seqlens_q, max_seqlens_k: Maximum sequence lengths;
                ``max_seqlens_q`` sizes the first grid axis.
            causal: Forwarded as ``IS_CAUSAL``.
            sm_scale: Softmax scale forwarded to the kernel.
            bias: Optional rank-4 bias tensor (its four strides are passed).
            fp8_scales: Optional ``(q_scale, k_scale, v_scale, p_scale)``;
                enables the FP8 path and quantizes non-fp8 inputs.
            fp8_out_scale: Optional tensor; when given the output is scaled
                by its reciprocal and ``USE_FP8_OUT`` is set.

        Returns:
            ``(o, encoded_softmax)`` where ``encoded_softmax`` is always
            ``None``.
        """
        use_fp8 = fp8_scales is not None
        if use_fp8:
            q_scale, k_scale, v_scale, p_scale = fp8_scales
            q = _to_float8(q, q_scale)
            k = _to_float8(k, k_scale)
            v = _to_float8(v, v_scale)
        else:
            q_scale = k_scale = v_scale = p_scale = 1.0

        if o is None:
            o = torch.empty_like(q, dtype=v.dtype)

        check_args(
            q,
            k,
            v,
            o,
            varlen=True,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
        )

        _, nheads_q, head_size = q.shape
        _, nheads_k, _ = k.shape
        batch = len(cu_seqlens_q) - 1

        # Strides are passed in the kernel's (batch, head, token, dim) order.
        # The batch stride is 0 because sequences are located through the
        # cu_seqlens offsets inside the kernel.
        # NOTE(review): attn_fwd declares tl.assume(stride_* > 0) for every
        # stride; these zero batch strides (and the all-zero bias strides
        # used when bias is None) do not satisfy that, so the kernel's
        # assumptions should be relaxed to >= 0.
        def packed_strides(t):
            return (0, t.stride(1), t.stride(0), t.stride(2))

        q_strides = packed_strides(q)
        k_strides = packed_strides(k)
        v_strides = packed_strides(v)
        o_strides = packed_strides(o)

        padded_d_model = _padded_head_dim(head_size)

        grid = lambda META: (
            triton.cdiv(max_seqlens_q, META["BLOCK_M"]),
            nheads_q,
            batch,
        )

        encoded_softmax = None

        # Seed the RNG so we get reproducible results for testing.
        philox_seed = 0x1BF52
        philox_offset = 0x1D4B42

        if bias is not None:
            bias_strides = (
                bias.stride(0),
                bias.stride(1),
                bias.stride(2),
                bias.stride(3),
            )
        else:
            bias_strides = (0, 0, 0, 0)

        p_descale = 1.0 / p_scale
        o_descale = (1.0 / fp8_out_scale.item()
                     if fp8_out_scale is not None else 1.0)

        arg_max_seqlens_q = 0 if on_gfx1x() else max_seqlens_q
        arg_max_seqlens_k = 0 if on_gfx1x() else max_seqlens_k

        config = _select_launch_config(use_fp8, padded_d_model)

        attn_fwd[grid](
            q,
            k,
            v,
            bias,
            sm_scale,
            q_scale,
            k_scale,
            v_scale,
            p_scale,
            p_descale,
            o_descale,
            None,
            o,
            *q_strides,
            *k_strides,
            *v_strides,
            *o_strides,
            *bias_strides,
            cu_seqlens_q,
            cu_seqlens_k,
            dropout_p=0.0,
            philox_seed=philox_seed,
            philox_offset_base=philox_offset,
            encoded_softmax=encoded_softmax,
            HQ=nheads_q,
            HK=nheads_k,
            ACTUAL_BLOCK_DMODEL=head_size,
            MAX_SEQLENS_Q=arg_max_seqlens_q,
            MAX_SEQLENS_K=arg_max_seqlens_k,
            IS_CAUSAL=causal,
            VARLEN=True,
            BLOCK_DMODEL=padded_d_model,
            BIAS_TYPE=0 if bias is None else 1,
            ENABLE_DROPOUT=False,
            RETURN_ENCODED_SOFTMAX=False,
            USE_FP8=use_fp8,
            USE_FP8_OUT=fp8_out_scale is not None,
            **config,
        )

        ctx.grid = grid
        ctx.sm_scale = sm_scale
        ctx.BLOCK_DMODEL = head_size
        ctx.causal = causal
        ctx.dropout_p = 0.0
        ctx.philox_seed = philox_seed
        ctx.philox_offset = philox_offset
        ctx.encoded_softmax = encoded_softmax
        ctx.return_encoded_softmax = False
        return o, encoded_softmax
ctx.BLOCK_DMODEL = head_size + ctx.causal = causal + ctx.dropout_p = 0.0 + ctx.philox_seed = philox_seed + ctx.philox_offset = philox_offset + ctx.encoded_softmax = encoded_softmax + ctx.return_encoded_softmax = False + return o, encoded_softmax + + +triton_attention = _attention.apply \ No newline at end of file diff --git a/aiter/ops/triton/fused_moe.py b/aiter/ops/triton/fused_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..8d2f0f9ac484b29e26511e6d7cd7257a22f4b8a0 --- /dev/null +++ b/aiter/ops/triton/fused_moe.py @@ -0,0 +1,1198 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Fused MoE kernel.""" +import functools +import json +import os +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch +import triton +import triton.language as tl +import aiter.ops.triton.utils.arch_info as arch_info + +from aiter.ops.triton.moe_op import fused_moe as invoke_fused_moe_kernel, support_mls +# from vllm import _custom_ops as ops +from aiter.ops.triton.utils.moe_config_utils import get_optimal_moe_config_func +from aiter import silu_and_mul, gelu_and_mul, moe_sum +from aiter import per_token_quant_hip, per_block_quant_wrapper +from aiter import sgl_moe_align_block_size as sgl_moe_align_block_size_aiter +from aiter import moe_align_block_size as moe_align_block_size_aiter +from aiter.jit.utils.torch_guard import torch_compile_guard +from aiter import dtypes,moe_sorting_fwd + +device_name = arch_info.get_device() + + +def get_moe_sum_config(M, top_k, N): + if M < 32: + return {"BLOCK_SIZE": 128, "num_warps": 1} + else: + return {"BLOCK_SIZE": 512, "num_warps": 4} + +# def generate_sum_configs(): +# configs = [] +# for block_n in [32, 64, 128, 256, 512, 1024, 2048, 4096]: +# for num_warps in [1, 2, 4]: +# for num_stages in [1, 2]: +# config = triton.Config({ +# 'BLOCK_SIZE': block_n, +# }, num_warps=num_warps, num_stages=num_stages) +# configs.append(config) +# return configs + +# @triton.autotune( +# key=['M', 'N', 
@triton.heuristics({
    "n_dividable": lambda args: (args["N"] % args["BLOCK_SIZE"]) == 0,
})
@triton.jit
def moe_sum_kernel(
    output_ptr,  # [M, N]
    input_ptr,  # [M, top_k, N]
    M,
    N: tl.constexpr,
    top_k: tl.constexpr,
    routed_scaling_factor,
    BLOCK_SIZE: tl.constexpr,
    stride_output_m,
    stride_output_n,
    stride_input_m,
    stride_input_k,
    stride_input_n,
    compute_type: tl.constexpr,
    n_dividable: tl.constexpr,
):
    # Sums the top_k slices of one input row over a BLOCK_SIZE-wide column
    # tile, scales the sum by routed_scaling_factor and stores it to the
    # output row.  n_dividable (set by the heuristic above) is true when N is
    # a multiple of BLOCK_SIZE, in which case the column mask is skipped.
    tl.assume(stride_output_m >= 0)
    tl.assume(stride_output_n >= 0)
    tl.assume(stride_input_m >= 0)
    tl.assume(stride_input_k >= 0)
    tl.assume(stride_input_n >= 0)

    # The 1D program id is split into a row index and a column-tile index.
    num_pid_n = tl.cdiv(N, BLOCK_SIZE)
    pid = tl.program_id(axis=0)
    pid_m = pid // num_pid_n
    pid_n = pid % num_pid_n
    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask_n = offs_n < N

    # Accumulate in float32 regardless of the input dtype.
    acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
    # Row offset is widened to int64 before multiplying by the row stride.
    input_row_ptr = input_ptr + pid_m.to(tl.int64) * stride_input_m
    for k in range(top_k):
        input_ptrs = input_row_ptr + (
            k * stride_input_k
            + offs_n * stride_input_n
        )
        if n_dividable:
            x = tl.load(input_ptrs)
        else:
            x = tl.load(input_ptrs, mask=mask_n, other=0.0)
        acc += x.to(tl.float32)

    acc *= routed_scaling_factor
    # Cast to the output element type only after scaling.
    acc = acc.to(compute_type)
    output_ptrs = output_ptr + (pid_m.to(tl.int64) * stride_output_m + offs_n * stride_output_n)
    if n_dividable:
        tl.store(output_ptrs, acc)
    else:
        tl.store(output_ptrs, acc, mask=mask_n)


def triton_moe_sum(input_tensor,
                   output_tensor,
                   routed_scaling_factor: float = 1.0):
    """
    1D tile version of moe_sum.

    Sums ``input_tensor`` over its second (top_k) dimension, multiplies the
    result by ``routed_scaling_factor`` and writes it into ``output_tensor``.

    Args:
        input_tensor: [M, top_k, N], must be contiguous.
        output_tensor: [M, N], must be contiguous and of dtype float16,
            bfloat16 or float32.
        routed_scaling_factor: Scalar applied to the summed values.

    Returns:
        ``output_tensor`` (filled in place).
    """
    M, top_k, N = input_tensor.shape

    assert output_tensor.dtype == torch.float16 or \
        output_tensor.dtype == torch.bfloat16 or \
        output_tensor.dtype == torch.float32

    # Map the torch output dtype onto the Triton dtype the kernel casts to.
    if output_tensor.dtype == torch.float16:
        compute_type = tl.float16
    elif output_tensor.dtype == torch.bfloat16:
        compute_type = tl.bfloat16
    elif output_tensor.dtype == torch.float32:
        compute_type = tl.float32

    assert input_tensor.is_contiguous()
    assert output_tensor.is_contiguous()
    assert input_tensor.shape[0] == output_tensor.shape[0]
    assert input_tensor.shape[2] == output_tensor.shape[1]

    # Compute the launch grid: one program per (row, column tile).
    config = get_moe_sum_config(M, top_k, N)
    grid = (M * triton.cdiv(N, config["BLOCK_SIZE"]),)
    # grid = lambda META: (M * triton.cdiv(N, META['BLOCK_SIZE']), )

    moe_sum_kernel[grid](
        output_tensor,
        input_tensor,
        M,
        N,
        top_k,
        routed_scaling_factor,
        stride_output_m=output_tensor.stride(0),
        stride_output_n=output_tensor.stride(1),
        stride_input_m=input_tensor.stride(0),
        stride_input_k=input_tensor.stride(1),
        stride_input_n=input_tensor.stride(2),
        compute_type=compute_type,
        **config,
    )

    return output_tensor
# @triton.autotune(
#     configs=[
#         triton.Config(
#             {"BLOCK_SIZE_M": bm, "BLOCK_SIZE_N": bn},
#             num_warps=nw,
#         )
#         for bm in (1, 4, 16, 32, 64, 128)
#         for bn in (32, 128, 256, 512,)
#         for nw in (1, 2, 4)
#     ],
#     key=["M", "N", "ACT"],
#     perf_debug=True,
# )
@triton.heuristics(
    {
        "M_DIV": lambda args: (args["M"] % args["BLOCK_SIZE_M"]) == 0,
        "N_DIV": lambda args: (args["N"] % args["BLOCK_SIZE_N"]) == 0,
    }
)
@triton.jit
def activation_and_mul_kernel(
    out_ptr,
    in_ptr,
    M,
    N,
    stride_in0,
    stride_in1,
    stride_out0,
    stride_out1,
    ACT: tl.constexpr,
    M_DIV: tl.constexpr,
    N_DIV: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
):
    # Computes out[m, n] = act(in[m, n]) * in[m, n + N] for one
    # BLOCK_SIZE_M x BLOCK_SIZE_N tile.  ACT == 0 selects x * sigmoid(x);
    # any other value selects the erf-based GELU.
    # Launch contract: program_id(0) indexes column tiles and program_id(1)
    # indexes row tiles, so the grid must be 2D in (N tiles, M tiles) order.
    tl.assume(M > 0)
    tl.assume(N > 0)
    tl.assume(stride_in0 >= 0)
    tl.assume(stride_in1 >= 0)
    tl.assume(stride_out0 >= 0)
    tl.assume(stride_out1 >= 0)

    pid_m = tl.program_id(1)
    pid_n = tl.program_id(0)

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)

    mask_m = offs_m < M
    mask_n = offs_n < N
    mask = mask_m[:, None] & mask_n[None, :]

    in_base = in_ptr + offs_m[:, None] * stride_in0 + offs_n[None, :] * stride_in1
    # x0 is the activated half; x1 is the half that starts N columns later.
    # Masking is skipped only when both dimensions divide evenly into tiles.
    if M_DIV and N_DIV:
        x0 = tl.load(in_base)
        x1 = tl.load(in_base + N * stride_in1)
    else:
        x0 = tl.load(in_base, mask=mask, other=0.0)
        x1 = tl.load(in_base + N * stride_in1, mask=mask, other=0.0)

    # All arithmetic is done in float32.
    x0_f = x0.to(tl.float32)
    x1_f = x1.to(tl.float32)
    if ACT == 0:
        act = x0_f * (1.0 / (1.0 + tl.exp(-x0_f)))
    else:
        # 0.7071067811865476 is 1 / sqrt(2).
        act = x0_f * 0.5 * (1.0 + tl.erf(x0_f * 0.7071067811865476))
    y = act * x1_f

    out_ptrs = out_ptr + offs_m[:, None] * stride_out0 + offs_n[None, :] * stride_out1
    if M_DIV and N_DIV:
        tl.store(out_ptrs, y)
    else:
        tl.store(out_ptrs, y, mask=mask)
def get_triton_activation_and_mul_config(M, N):
    """Return the tile sizes and warp count for the activation kernels.

    Inputs with at most 512 rows use one row per program with a 256-wide
    column tile; larger inputs use 16 x 128 tiles.  ``N`` is accepted but
    not used by the selection.
    """
    if M <= 512:
        return {"BLOCK_SIZE_M": 1, "BLOCK_SIZE_N": 256, "num_warps": 1}
    return {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "num_warps": 4}


@triton.heuristics(
    {
        "M_DIV": lambda args: (args["M"] % args["BLOCK_SIZE_M"]) == 0,
        "N_DIV": lambda args: (args["N"] % args["BLOCK_SIZE_N"]) == 0,
    }
)
@triton.jit
def relu2_kernel(
    out_ptr,
    in_ptr,
    M,
    N,
    stride_in0,
    stride_in1,
    stride_out0,
    stride_out1,
    M_DIV: tl.constexpr,
    N_DIV: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
):
    # Elementwise squared ReLU over one BLOCK_SIZE_M x BLOCK_SIZE_N tile:
    # out = x * x where x > 0, else 0.
    # program_id(0) indexes column tiles and program_id(1) indexes row tiles.
    tl.assume(M > 0)
    tl.assume(N > 0)
    pid_m = tl.program_id(1)
    pid_n = tl.program_id(0)
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    mask_m = offs_m < M
    mask_n = offs_n < N
    mask = mask_m[:, None] & mask_n[None, :]
    in_base = in_ptr + offs_m[:, None] * stride_in0 + offs_n[None, :] * stride_in1
    # The mask is only needed when a dimension does not divide into tiles.
    if M_DIV and N_DIV:
        x = tl.load(in_base)
    else:
        x = tl.load(in_base, mask=mask, other=0.0)
    # Square in float32, then cast back to the input dtype on store.
    xf = x.to(tl.float32)
    y = tl.where(xf > 0.0, xf * xf, 0.0)
    out_ptrs = out_ptr + offs_m[:, None] * stride_out0 + offs_n[None, :] * stride_out1
    if M_DIV and N_DIV:
        tl.store(out_ptrs, y.to(x.dtype))
    else:
        tl.store(out_ptrs, y.to(x.dtype), mask=mask)
def _activation_launch_grid(M, N, config):
    """Return the 2D launch grid shared by the activation kernels.

    ``relu2_kernel`` and ``activation_and_mul_kernel`` both read the column
    tile from ``program_id(0)`` and the row tile from ``program_id(1)``, so
    the grid is ``(column tiles, row tiles)``.
    """
    block_m = config["BLOCK_SIZE_M"]
    block_n = config["BLOCK_SIZE_N"]
    return ((N + block_n - 1) // block_n, (M + block_m - 1) // block_m)


def triton_relu2(out: torch.Tensor, inp: torch.Tensor) -> None:
    """Elementwise ReLU² (no gate / no second-path mul). out.shape == inp.shape.

    Both tensors must be contiguous; the two dimensions of ``inp`` are used
    as ``(M, N)``.  The result is written into ``out``.
    """
    assert inp.shape == out.shape
    assert inp.is_contiguous() and out.is_contiguous()
    M, N = inp.shape
    config = get_triton_activation_and_mul_config(M, N)
    grid = _activation_launch_grid(M, N, config)
    relu2_kernel[grid](
        out,
        inp,
        M,
        N,
        inp.stride(0),
        inp.stride(1),
        out.stride(0),
        out.stride(1),
        **config,
    )


def _launch_activation_and_mul(out: torch.Tensor, input: torch.Tensor,
                               act: int) -> None:
    """Flatten ``input`` to ``(M, 2N)`` and launch ``activation_and_mul_kernel``.

    ``act`` is forwarded as the kernel's ``ACT`` selector (0 for the
    sigmoid-gated form, anything else for the erf-based GELU).
    """
    assert input.shape[-1] % 2 == 0
    assert input.is_contiguous()
    assert out.is_contiguous()
    M = input.numel() // input.shape[-1]
    N = input.shape[-1] // 2
    input_2d = input.view(M, input.shape[-1])
    out_2d = out.view(M, N)

    config = get_triton_activation_and_mul_config(M, N)
    grid = _activation_launch_grid(M, N, config)
    activation_and_mul_kernel[grid](
        out_2d,
        input_2d,
        M,
        N,
        input_2d.stride(0),
        input_2d.stride(1),
        out_2d.stride(0),
        out_2d.stride(1),
        ACT=act,
        **config,
    )


def triton_silu_and_mul(out: torch.Tensor, input: torch.Tensor) -> None:
    """Write ``silu(input[..., :N]) * input[..., N:]`` into ``out``.

    ``N`` is half of ``input``'s last dimension, which must be even.  Both
    tensors must be contiguous.
    """
    _launch_activation_and_mul(out, input, 0)


def triton_gelu_and_mul(out: torch.Tensor, input: torch.Tensor) -> None:
    """Write ``gelu(input[..., :N]) * input[..., N:]`` into ``out``.

    ``N`` is half of ``input``'s last dimension, which must be even.  Both
    tensors must be contiguous.

    The kernel indexes row tiles with ``program_id(1)``, so it has to be
    launched on the same 2D grid as ``triton_silu_and_mul``.  A flattened 1D
    grid leaves ``program_id(1)`` at 0 and only the first row tile would be
    computed.
    """
    _launch_activation_and_mul(out, input, 1)
META: ( + # triton.cdiv(M, META["BLOCK_SIZE_M"]), + # triton.cdiv(N, META["BLOCK_SIZE_N"]), + # ) + config = get_triton_activation_and_mul_config(M, N) + grid = (triton.cdiv(M, config['BLOCK_SIZE_M']) * triton.cdiv(N, config['BLOCK_SIZE_N']),) + activation_and_mul_kernel[grid]( + out_2d, + input_2d, + M, + N, + input_2d.stride(0), + input_2d.stride(1), + out_2d.stride(0), + out_2d.stride(1), + ACT=1, + **config, + ) + + +def ceil_div(a, b): + return (a + b - 1) // b + +@triton.jit +def moe_align_block_size_stage1( + topk_ids_ptr, + tokens_cnts_ptr, + num_experts: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + + start_idx = pid * tokens_per_thread + + off_c = (pid + 1) * num_experts + + for i in range(tokens_per_thread): + if start_idx + i < numel: + idx = tl.load(topk_ids_ptr + start_idx + i) + token_cnt = tl.load(tokens_cnts_ptr + off_c + idx) + tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1) + + +@triton.jit +def moe_align_block_size_stage2( + tokens_cnts_ptr, + num_experts: tl.constexpr, +): + pid = tl.program_id(0) + + last_cnt = 0 + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid) + last_cnt = last_cnt + token_cnt + tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt) + + +@triton.jit +def moe_align_block_size_stage3( + total_tokens_post_pad_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, +): + last_cumsum = 0 + off_cnt = num_experts * num_experts + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1) + last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size + tl.store(cumsum_ptr + i, last_cumsum) + tl.store(total_tokens_post_pad_ptr, last_cumsum) + + +@triton.jit +def moe_align_block_size_stage4( + topk_ids_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: 
def moe_align_block_size(
    topk_ids: torch.Tensor,
    block_size: int,
    num_experts: int,
    expert_map: torch.Tensor = None
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Group token indices by expert and pad each group to a multiple of
    ``block_size`` so the grouped rows can be consumed by a blocked matmul.

    Parameters:
    - topk_ids: A tensor of shape [total_tokens, top_k] holding the top-k
        expert indices chosen for each token.
    - block_size: The block size used in block matrix multiplication.
    - num_experts: The total number of experts.
    - expert_map: Optional tensor of shape [num_experts] mapping a global
        expert index to the local index on the current expert-parallel shard
        (-1 when the expert does not live on this shard).

    Returns:
    - sorted_token_ids: Token indices sorted by their assigned expert.
        Padding slots hold ``topk_ids.numel()``.
    - expert_ids: The expert index assigned to each block (remapped through
        ``expert_map`` when it is given).
    - num_tokens_post_padded: One-element tensor with the total number of
        tokens after padding, which is divisible by ``block_size``.

    Example:
    Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
    block_size = 4, and num_experts = 4:
    - There are 12 tokens (after repeating 'top_k' times) and 4 experts,
        each expert needing to process 3 tokens.
    - As block_size is 4, 1 padding token is added per expert.
    - topk_ids is flattened to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
    - Padding tokens [12, 12, 12, 12] are appended, one per block.
    - After sorting by expert index, token_ids becomes
        [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12].
        Tokens 12 are non-existent (padding) and are ignored in
        the subsequent matrix multiplication.
    """
    device = topk_ids.device
    num_assignments = topk_ids.numel()
    # Worst case: every expert needs (block_size - 1) padding slots.
    padded_capacity = num_assignments + num_experts * (block_size - 1)

    # Padding slots are pre-filled with an index one past the last real token.
    sorted_ids = torch.full((padded_capacity, ),
                            num_assignments,
                            dtype=torch.int32,
                            device=device)
    # Expert ids must be zeroed out to prevent index out of bounds error while
    # mapping global expert ids to local expert ids in expert parallelism.
    expert_ids = torch.zeros((triton.cdiv(padded_capacity, block_size), ),
                             dtype=torch.int32,
                             device=device)
    num_tokens_post_pad = torch.empty((1, ),
                                      dtype=torch.int32,
                                      device=device)

    # Backend selection: below 224 experts use the generic aiter op; exactly
    # 256 experts use the sgl-style aiter op (it currently requires
    # num_experts=256); every other count >= 224 uses the Triton version.
    if num_experts < 224:
        align_backend = moe_align_block_size_aiter
    elif num_experts == 256:
        align_backend = sgl_moe_align_block_size_aiter
    else:
        align_backend = moe_align_block_size_triton
    align_backend(
        topk_ids,
        num_experts,
        block_size,
        sorted_ids,
        expert_ids,
        num_tokens_post_pad,
    )

    if expert_map is not None:
        expert_ids = expert_map[expert_ids]

    return sorted_ids, expert_ids, num_tokens_post_pad
def inplace_fused_experts(hidden_states: torch.Tensor,
                          w1: torch.Tensor,
                          w2: torch.Tensor,
                          topk_weights: torch.Tensor,
                          topk_ids: torch.Tensor,
                          activation: Optional[str] = None,
                          use_fp8_w8a8: bool = False,
                          use_int8_w8a8: bool = False,
                          use_int8_w8a16: bool = False,
                          use_int4_w4a16: bool = False,
                          global_num_experts: int = -1,
                          expert_map: Optional[torch.Tensor] = None,
                          w1_scale: Optional[torch.Tensor] = None,
                          w2_scale: Optional[torch.Tensor] = None,
                          w1_zp: Optional[torch.Tensor] = None,
                          w2_zp: Optional[torch.Tensor] = None,
                          a1_scale: Optional[torch.Tensor] = None,
                          a2_scale: Optional[torch.Tensor] = None,
                          block_shape: Optional[List[int]] = None) -> None:
    """Run the fused MoE experts and write the result into ``hidden_states``.

    Thin wrapper over ``fused_experts_impl`` with ``inplace=True``.
    ``activation`` defaults to ``"silu"`` when None.
    """
    if activation is None:
        activation = "silu"
    # Arguments are passed by keyword: fused_experts_impl takes `odtype`,
    # `use_int4_w4a8` and `per_channel_quant`, which this wrapper does not
    # expose, so a positional call would bind every later argument wrongly.
    # The result overwrites hidden_states, so its dtype is the output dtype.
    fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
                       odtype=hidden_states.dtype,
                       inplace=True,
                       activation=activation,
                       use_fp8_w8a8=use_fp8_w8a8,
                       use_int8_w8a8=use_int8_w8a8,
                       use_int8_w8a16=use_int8_w8a16,
                       use_int4_w4a16=use_int4_w4a16,
                       global_num_experts=global_num_experts,
                       expert_map=expert_map,
                       w1_scale=w1_scale,
                       w2_scale=w2_scale,
                       w1_zp=w1_zp,
                       w2_zp=w2_zp,
                       a1_scale=a1_scale,
                       a2_scale=a2_scale,
                       block_shape=block_shape)


def outplace_fused_experts(
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: Optional[str] = None,
        use_fp8_w8a8: bool = False,
        use_int8_w8a8: bool = False,
        use_int8_w8a16: bool = False,
        use_int4_w4a16: bool = False,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        w1_scale: Optional[torch.Tensor] = None,
        w2_scale: Optional[torch.Tensor] = None,
        w1_zp: Optional[torch.Tensor] = None,
        w2_zp: Optional[torch.Tensor] = None,
        a1_scale: Optional[torch.Tensor] = None,
        a2_scale: Optional[torch.Tensor] = None,
        block_shape: Optional[List[int]] = None) -> torch.Tensor:
    """Run the fused MoE experts and return the result in a new tensor.

    Thin wrapper over ``fused_experts_impl`` with ``inplace=False``.
    ``activation`` defaults to ``"silu"`` when None.
    """
    if activation is None:
        activation = "silu"
    # Keyword arguments for the same reason as in inplace_fused_experts.
    # NOTE(review): odtype is set to the input dtype because this wrapper has
    # no way to request another one — confirm for int8/fp8 hidden_states.
    return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
                              odtype=hidden_states.dtype,
                              inplace=False,
                              activation=activation,
                              use_fp8_w8a8=use_fp8_w8a8,
                              use_int8_w8a8=use_int8_w8a8,
                              use_int8_w8a16=use_int8_w8a16,
                              use_int4_w4a16=use_int4_w4a16,
                              global_num_experts=global_num_experts,
                              expert_map=expert_map,
                              w1_scale=w1_scale,
                              w2_scale=w2_scale,
                              w1_zp=w1_zp,
                              w2_zp=w2_zp,
                              a1_scale=a1_scale,
                              a2_scale=a2_scale,
                              block_shape=block_shape)


def fused_experts(hidden_states: torch.Tensor,
                  w1: torch.Tensor,
                  w2: torch.Tensor,
                  topk_weights: torch.Tensor,
                  topk_ids: torch.Tensor,
                  inplace: bool = False,
                  activation: Optional[str] = None,
                  use_fp8_w8a8: bool = False,
                  use_int8_w8a8: bool = False,
                  use_int8_w8a16: bool = False,
                  use_int4_w4a16: bool = False,
                  global_num_experts: int = -1,
                  expert_map: Optional[torch.Tensor] = None,
                  w1_scale: Optional[torch.Tensor] = None,
                  w2_scale: Optional[torch.Tensor] = None,
                  w1_zp: Optional[torch.Tensor] = None,
                  w2_zp: Optional[torch.Tensor] = None,
                  a1_scale: Optional[torch.Tensor] = None,
                  a2_scale: Optional[torch.Tensor] = None,
                  block_shape: Optional[List[int]] = None) -> torch.Tensor:
    """Dispatch to the in-place or out-of-place fused-experts custom op.

    Both variants are looked up under ``torch.ops.vllm``.
    NOTE(review): the registration of those ops is not visible here — confirm
    they are registered before this function is called.

    Returns:
        ``hidden_states`` itself when ``inplace`` is true, otherwise the
        tensor returned by the out-of-place op.
    """
    if activation is None:
        activation = 'silu'
    # Both ops take the same positional argument list.
    op_args = (
        hidden_states, w1, w2, topk_weights, topk_ids, activation,
        use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
        global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp,
        a1_scale, a2_scale, block_shape,
    )
    if inplace:
        torch.ops.vllm.inplace_fused_experts(*op_args)
        return hidden_states
    return torch.ops.vllm.outplace_fused_experts(*op_args)
def _make_moe_activation_quantizer(use_fp8_w8a8, use_int8_w8a8, use_int4_w4a8,
                                   per_channel_quant, block_shape):
    """Return the dynamic activation quantizer for this MoE setup, or None.

    Per-token quantization is chosen when ``per_channel_quant`` is set,
    otherwise per-block quantization when ``block_shape`` is given. None means
    activations are passed to the kernel unchanged with their static scale.
    """
    if not (use_int8_w8a8 or use_fp8_w8a8 or use_int4_w4a8):
        return None
    quant_dtype = torch.float8_e4m3fn if use_fp8_w8a8 else torch.int8
    if per_channel_quant:
        quantize = per_token_quant_hip
    elif block_shape is not None:
        quantize = per_block_quant_wrapper((1, block_shape[1]))(per_token_quant_hip)
    else:
        return None
    return functools.partial(quantize, quant_dtype=quant_dtype)


@torch_compile_guard(gen_fake=fused_moe_fake)
def fused_experts_impl(hidden_states: torch.Tensor,
                       w1: torch.Tensor,
                       w2: torch.Tensor,
                       topk_weights: torch.Tensor,
                       topk_ids: torch.Tensor,
                       odtype:torch.dtype, #compute or output type for i8& f8
                       inplace: bool = False,
                       activation: str = "silu",
                       use_fp8_w8a8: bool = False,
                       use_int8_w8a8: bool = False,
                       use_int8_w8a16: bool = False,
                       use_int4_w4a16: bool = False,
                       use_int4_w4a8: bool = False,
                       per_channel_quant: bool = False,
                       global_num_experts: int = -1,
                       expert_map: Optional[torch.Tensor] = None,
                       w1_scale: Optional[torch.Tensor] = None,
                       w2_scale: Optional[torch.Tensor] = None,
                       w1_zp: Optional[torch.Tensor] = None,
                       w2_zp: Optional[torch.Tensor] = None,
                       a1_scale: Optional[torch.Tensor] = None,
                       a2_scale: Optional[torch.Tensor] = None,
                       block_shape: Optional[List[int]] = None,
                       routed_scaling_factor: Optional[float] = 1.0)-> torch.Tensor:
    """Run the two-GEMM fused MoE pipeline over ``hidden_states`` in chunks.

    For each chunk of at most ``TRITON_FUSED_MOE_CHUNK_SIZE`` tokens
    (environment variable, default 16384):
      1. sort/align the token-to-expert assignments to ``BLOCK_SIZE_M``;
      2. GEMM with ``w1`` into ``intermediate_cache1``;
      3. apply the activation (``"silu"``, ``"gelu"`` or ``"relu2"``) into
         ``intermediate_cache2``;
      4. GEMM with ``w2`` into ``intermediate_cache3``;
      5. reduce ``intermediate_cache3`` into the output rows of the chunk and
         apply ``routed_scaling_factor``.

    Args:
        hidden_states: contiguous 2-D input; its second dimension must equal
            ``w1.shape[2]`` (or twice that for the int4 weight formats).
        w1, w2: expert weights; ``w1`` is unpacked as ``(E, N, _)``.
        topk_weights, topk_ids: per-token routing weights and expert ids,
            of identical shape.
        odtype: dtype of the intermediate caches and, when not ``inplace``,
            of the returned tensor.
        inplace: write the result into ``hidden_states`` instead of a new
            tensor.
        expert_map: optional global-to-local expert mapping; when given, the
            caches are zero-initialised and the non-CK sorting path is used.
        routed_scaling_factor: output multiplier; None is treated as 1.0.

    Returns:
        The output tensor (``hidden_states`` itself when ``inplace``).

    Raises:
        ValueError: for an unsupported ``hidden_states`` dtype or activation.
    """
    if routed_scaling_factor is None:
        routed_scaling_factor = 1.0

    # Check constraints.
    if use_int4_w4a16 or use_int4_w4a8:
        assert hidden_states.shape[1] // 2 == w1.shape[
            2], "Hidden size mismatch"
    else:
        assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"

    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
    assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
    assert hidden_states.dtype in [
        torch.float32, torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn
    ]

    num_tokens, _ = hidden_states.shape
    E, N, _ = w1.shape
    if global_num_experts == -1:
        global_num_experts = E
    top_k_num = topk_ids.shape[1]
    # We execute the fused_moe kernel in chunks to circumvent this issue:
    # https://github.com/vllm-project/vllm/issues/5938
    CHUNK_SIZE = int(os.environ.get("TRITON_FUSED_MOE_CHUNK_SIZE", "16384"))
    M = min(num_tokens, CHUNK_SIZE)

    # Keyword arguments shared by the config lookups of both GEMMs.
    config_kwargs = dict(
        use_int8_w8a16=use_int8_w8a16,
        use_int8_w8a8=use_int8_w8a8,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int4_w4a16=use_int4_w4a16,
        use_int4_w4a8=use_int4_w4a8,
        use_mxfp4_w4a4=False,  # always false in wna16
        block_shape=block_shape,
    )
    moe_config_func = get_optimal_moe_config_func(
        hidden_states, w1, topk_ids, is_bottom=False, **config_kwargs)
    moe_config_func2 = get_optimal_moe_config_func(
        hidden_states, w2, topk_ids, is_bottom=True, **config_kwargs)
    config = moe_config_func(M)
    config2, max_block_m = moe_config_func2(M)

    bottom_moe_a_use_mls = (
        _bottom_moe_use_mls()
        and not use_int4_w4a8
        and config2 is not None
        and config2.get("USE_MLS_LOAD", False))

    topk = top_k_num
    # Extra rows reserved for per-expert padding when the second GEMM reads
    # its input in sorted (MLS) layout.
    max_padded_tokens = (
        min(M * topk, E + 1) * (max_block_m - 1) if bottom_moe_a_use_mls else 0
    )
    total_tokens = M * topk + max_padded_tokens

    # With an expert map (EP mode) the caches must start zeroed.
    alloc = torch.zeros if expert_map is not None else torch.empty

    # We can reuse the memory between these because by the time we need
    # cache3, we're done with cache1
    cache13 = alloc(total_tokens * max(N, w2.shape[1]),
                    device=hidden_states.device,
                    dtype=odtype)
    intermediate_cache3 = cache13[:M * topk * w2.shape[1]].view(
        (M, topk, w2.shape[1]))

    if hidden_states.dtype == torch.bfloat16:
        compute_type = tl.bfloat16
    elif hidden_states.dtype == torch.float16:
        compute_type = tl.float16
    elif hidden_states.dtype == torch.float32:
        compute_type = tl.float32
    elif hidden_states.dtype == torch.int8 or hidden_states.dtype == torch.float8_e4m3fn:
        # Pre-quantized input: compute in the requested output precision.
        if odtype == torch.bfloat16:
            compute_type = tl.bfloat16
        else:
            compute_type = tl.float16
    else:
        raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")

    if inplace:
        out_hidden_states = hidden_states
    else:
        out_hidden_states = torch.empty(hidden_states.shape, device=hidden_states.device, dtype=odtype)

    quantize = _make_moe_activation_quantizer(
        use_fp8_w8a8, use_int8_w8a8, use_int4_w4a8, per_channel_quant,
        block_shape)

    ck_sorting = expert_map is None  # TODO: check why expert_map failed issue.

    # Keyword arguments common to both invoke_fused_moe_kernel launches.
    kernel_kwargs = dict(
        compute_type=compute_type,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a8=use_int8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
        use_int4_w4a8=use_int4_w4a8,
        per_channel_quant=per_channel_quant,
        block_shape=block_shape,
        ck_sorting=ck_sorting,
        ck_topk=top_k_num,
    )

    for chunk in range((num_tokens // CHUNK_SIZE) + 1):
        begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE,
                                          min((chunk + 1) * CHUNK_SIZE,
                                              num_tokens))
        curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
        tokens_in_chunk, _ = curr_hidden_states.shape

        if tokens_in_chunk == 0:
            break

        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
            # Adjust the intermediate cache size and config for the last
            # chunk. Note that in most cases we only have one chunk
            # so the cache size and config are already set correctly and
            # do not need to be adjusted.
            config = moe_config_func(tokens_in_chunk)
            config2, max_block_m = moe_config_func2(tokens_in_chunk)
            # NOTE(review): this condition differs from the one used before
            # the loop (no `not use_int4_w4a8`, an extra block_shape/quant
            # requirement, and `pop` instead of `get`, which removes the key
            # from config2). Kept as-is; confirm which form is intended.
            bottom_moe_a_use_mls = (
                _bottom_moe_use_mls()
                and config2 is not None
                and config2.pop("USE_MLS_LOAD", False)
                and (block_shape is not None and (use_int8_w8a8 or use_fp8_w8a8)))
            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]

        # NOTE(review): padding here is derived from config["BLOCK_SIZE_M"],
        # while cache13 was sized with max_block_m — confirm that
        # BLOCK_SIZE_M <= max_block_m always holds.
        padded_tokens = (
            min(tokens_in_chunk * topk, E + 1) * (config["BLOCK_SIZE_M"] - 1)
            if bottom_moe_a_use_mls
            else 0
        )
        total_tokens = tokens_in_chunk * topk + padded_tokens
        intermediate_cache1 = cache13[: total_tokens * N].view(
            (total_tokens, N),
        )
        # relu2 has no gate half, so its output keeps all N columns.
        intermediate_cache2 = alloc(
            (total_tokens, N // 2 if activation != "relu2" else N),
            device=hidden_states.device,
            dtype=odtype)

        curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
        curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]

        sorted_weights = None
        if not ck_sorting:
            sorted_token_ids, expert_ids, num_tokens_post_padded = (
                moe_align_block_size(curr_topk_ids, config['BLOCK_SIZE_M'],
                                     global_num_experts, expert_map))
        else:
            sorted_token_ids, sorted_weights, expert_ids, num_tokens_post_padded, \
                _tokens_positions_per_expert, _moe_buf = (
                    moe_sorting_ck(curr_topk_ids, curr_topk_weights, global_num_experts,
                                   w2.shape[1], odtype, config["BLOCK_SIZE_M"], expert_map)
                )

        # First GEMM input: quantize on the fly only for fp16/bf16
        # activations; anything else is used as-is with the static a1_scale.
        if quantize is not None and curr_hidden_states.dtype in (
                torch.float16, torch.bfloat16):
            gemm1_input, gemm1_scale = quantize(curr_hidden_states)
        else:
            gemm1_input, gemm1_scale = curr_hidden_states, a1_scale

        # The two positional flags are presumably mul_routed_weight and top_k
        # (as in vLLM's invoke_fused_moe_kernel) — TODO confirm.
        invoke_fused_moe_kernel(gemm1_input,
                                w1,
                                intermediate_cache1,
                                gemm1_scale,
                                w1_scale,
                                w1_zp,
                                curr_topk_weights,
                                curr_topk_ids,
                                sorted_token_ids,
                                sorted_weights,
                                expert_ids,
                                num_tokens_post_padded,
                                False,
                                top_k_num,
                                c_sorted=bottom_moe_a_use_mls,
                                config=config,
                                **kernel_kwargs)

        if activation == "silu":
            triton_silu_and_mul(intermediate_cache2,
                                intermediate_cache1.view(-1, N))
        elif activation == "gelu":
            triton_gelu_and_mul(intermediate_cache2,
                                intermediate_cache1.view(-1, N))
        elif activation == "relu2":
            triton_relu2(
                intermediate_cache2,
                intermediate_cache1.view(-1, N),
            )
        else:
            raise ValueError(f"Unsupported FusedMoe activation: {activation}")
        if expert_map is not None:
            # for EP mode, intermediate_cache1 and intermediate_cache3 need be zeros inited
            # since intermediate_cache1 and intermediate_cache3 shared same buffer,
            # to make sure intermediate_cache3 is zeros inited,
            # intermediate_cache1 need inited to zeros after silu_and_mul
            intermediate_cache1.fill_(0)

        # Second GEMM input: the activation output is always re-quantized
        # when a dynamic quantizer is configured.
        if quantize is not None:
            gemm2_input, gemm2_scale = quantize(intermediate_cache2)
        else:
            gemm2_input, gemm2_scale = intermediate_cache2, a2_scale

        invoke_fused_moe_kernel(gemm2_input,
                                w2,
                                intermediate_cache3,
                                gemm2_scale,
                                w2_scale,
                                w2_zp,
                                curr_topk_weights,
                                curr_topk_ids,
                                sorted_token_ids,
                                sorted_weights,
                                expert_ids,
                                num_tokens_post_padded,
                                True,
                                1,
                                bottom_a_use_mls_load=bottom_moe_a_use_mls,
                                config=config2,
                                **kernel_kwargs)

        out_chunk = out_hidden_states[begin_chunk_idx:end_chunk_idx]
        if out_hidden_states.dtype in (torch.float16, torch.bfloat16,
                                       torch.float32):
            # The Triton reduction applies the routed scaling factor itself.
            triton_moe_sum(intermediate_cache3,
                           out_chunk,
                           routed_scaling_factor=routed_scaling_factor)
        else:
            moe_sum(intermediate_cache3, out_chunk)
            if routed_scaling_factor != 1.0:
                out_chunk.mul_(routed_scaling_factor)
        if end_chunk_idx < num_tokens and expert_map is not None:
            # if has next chunk, intermediate_cache3 need init to zeros
            intermediate_cache3.fill_(0)
    return out_hidden_states
def fused_mul_add(
    x: torch.Tensor,
    a: torch.Tensor | float | int,
    b: torch.Tensor | float | int,
    out: Optional[torch.Tensor] = None,
):
    """
    Compute the elementwise multiply-add ``out = a * x + b``.

    The arithmetic is carried out in float32 inside the kernel and the result
    is cast to the dtype of ``out``.

    Key parameters:
    - x: contiguous torch.Tensor of arbitrary shape
    - a: float, int, or contiguous torch.Tensor with either one element or the
      same number of elements as x
    - b: float, int, or contiguous torch.Tensor with either one element or the
      same number of elements as x
    - out: optional contiguous torch.Tensor with the same number of elements
      as x; when None the result is written in place into x

    Returns:
    - out: the tensor that received the result (x itself when out is None)
    """

    N = x.numel()
    assert x.is_contiguous(), "x should be contiguous"
    assert isinstance(a, (float, int)) or (
        isinstance(a, torch.Tensor) and a.is_contiguous() and a.numel() in [1, N]
    ), "a should be a scalar or contiguous tensor with the same number of elements as x"
    assert isinstance(b, (float, int)) or (
        isinstance(b, torch.Tensor) and b.is_contiguous() and b.numel() in [1, N]
    ), "b should be a scalar or contiguous tensor with the same number of elements as x"

    if out is None:
        out = x
    else:
        assert (
            out.is_contiguous() and out.numel() == N
        ), "out should be contiguous with the same number of elements as x"

    if N == 0:
        # Nothing to compute; skip launching an empty grid.
        return out

    # A Python number is passed to the kernel by value; a one-element tensor
    # is loaded once inside the kernel; a full-size tensor is loaded per tile.
    IS_A_TENSOR = isinstance(a, torch.Tensor)
    IS_A_SCALAR = not IS_A_TENSOR or a.numel() == 1
    IS_B_TENSOR = isinstance(b, torch.Tensor)
    IS_B_SCALAR = not IS_B_TENSOR or b.numel() == 1

    # Clamp the tile size to [32, 1024] so small inputs get a small tile.
    BLOCK_SIZE_N = min(max(triton.next_power_of_2(N), 32), 1024)
    grid = (triton.cdiv(N, BLOCK_SIZE_N),)
    _fused_mul_add_kernel[grid](
        x,
        a,
        b,
        out,
        N,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        NEED_MASK=N % BLOCK_SIZE_N != 0,
        IS_A_SCALAR=IS_A_SCALAR,
        IS_B_SCALAR=IS_B_SCALAR,
        IS_A_TENSOR=IS_A_TENSOR,
        IS_B_TENSOR=IS_B_TENSOR,
        num_warps=4,
        waves_per_eu=0,
    )

    return out
def fused_rms_mxfp4_quant(
    inp1,
    inp1_weight,
    inp1_epsilon,
    inp2=None,
    inp2_weight=None,
    inp2_epsilon=0.0,
    res1=None,
):
    """
    Fused (optional residual add) + RMS norm + MXFP4 quantization.

    This op contains several steps, executed by one kernel program per row:
    1. if res1 is not None, inp1 = inp1 + res1, and the sum is stored to out_res1
    2. perform RMS norm along the last dimension for inp1
    3. if inp2 is not None, perform RMS norm along the last dimension for inp2
    4. perform mxfp4 quantization for the normalized inp1 only

    Key parameters:
    - inp1: Matrix with shape (M, N1). N1 must be even because two fp4 values
      are packed into one uint8.
    - inp1_weight: RMS norm weight for inp1, read at column indices [0, N1).
    - inp1_epsilon: Epsilon added inside the rsqrt for inp1.
    - inp2: Optional matrix with shape (M, N2).
    - inp2_weight: RMS norm weight for inp2, read at column indices [0, N2).
    - inp2_epsilon: Epsilon added inside the rsqrt for inp2.
    - res1: Optional residual with shape (M, N1) that is added to inp1.

    Only the row strides of inp1, inp2 and res1 are passed to the kernel, so
    their last dimension is addressed with unit stride.

    Returns:
    - out1_fp4: The output matrix with shape (M, N1 // 2), dtype uint8.
    - out1_bs: The block scales with shape (M, cdiv(N1, MXFP4_QUANT_BLOCK_SIZE)),
      dtype uint8. It is a transposed view of a (cdiv(N1, 32), M) buffer.
    - out2: The output matrix with shape (M, N2), dtype of inp1.
    - out_res1: The output matrix with shape (M, N1), dtype of inp1.

        if both inp2 and res1 provided, return (out1_fp4, out1_bs), out2, out_res1
        if inp2 provided, return (out1_fp4, out1_bs), out2
        if res1 provided, return (out1_fp4, out1_bs), out_res1
        if both inp2 and res1 not provided, return (out1_fp4, out1_bs)
    """

    MXFP4_QUANT_BLOCK_SIZE = 32
    M, N1 = inp1.shape
    # as we merge 2 fp4s to 1 uint8
    assert N1 % 2 == 0, "inp1 last dimension must be even to pack two fp4 values"

    # A single program processes a full row of inp1 (and inp2), so the block
    # has to span the wider row and at least one quantization group.
    BLOCK_SIZE = max(triton.next_power_of_2(N1), MXFP4_QUANT_BLOCK_SIZE)
    if inp2 is not None:
        assert inp2.shape[0] == M, "inp2 must have the same number of rows as inp1"
        N2 = inp2.shape[1]
        BLOCK_SIZE = max(triton.next_power_of_2(N2), BLOCK_SIZE)
    else:
        N2 = 0

    out1_fp4 = torch.empty((M, N1 // 2), dtype=torch.uint8, device=inp1.device)
    out1_bs = torch.empty(
        ((N1 + MXFP4_QUANT_BLOCK_SIZE - 1) // MXFP4_QUANT_BLOCK_SIZE, M),
        dtype=torch.uint8,
        device=inp1.device,
    ).T

    out_res1 = None
    res1_row_stride = 0
    out_res1_row_stride = 0
    if res1 is not None:
        # The kernel reads N1 columns of res1 for each of the M rows.
        assert tuple(res1.shape) == (M, N1), "res1 must have the same shape as inp1"
        out_res1 = torch.empty((M, N1), dtype=inp1.dtype, device=inp1.device)
        res1_row_stride = res1.stride(0)
        out_res1_row_stride = out_res1.stride(0)

    out2 = None
    out2_row_stride = 0
    inp2_row_stride = 0
    if inp2 is not None:
        out2 = torch.empty((M, N2), dtype=inp1.dtype, device=inp1.device)
        inp2_row_stride = inp2.stride(0)
        out2_row_stride = out2.stride(0)

    _fused_rms_mxfp4_quant_kernel[(M,)](
        inp1,
        inp1_weight,
        inp2,
        inp2_weight,
        res1,
        out1_fp4,
        out1_bs,
        out2,
        out_res1,
        inp1_epsilon,
        inp2_epsilon,
        M,
        N1,
        N2,
        inp1.stride(0),
        inp2_row_stride,
        res1_row_stride,
        out1_fp4.stride(0),
        *out1_bs.stride(),
        out2_row_stride,
        out_res1_row_stride,
        BLOCK_SIZE=BLOCK_SIZE,
        MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE,
        SKIP_SECOND_INPUT=(inp2 is None),
        FIRST_INPUT_RES=(res1 is not None),
    )

    quantized = (out1_fp4, out1_bs)
    if res1 is not None:
        if inp2 is None:
            return quantized, out_res1
        return quantized, out2, out_res1
    if inp2 is None:
        return quantized
    return quantized, out2
+ """ + M, N1, N2 = x.shape + + MXFP4_QUANT_BLOCK_SIZE = 32 + BLOCK_SIZE_N2 = max(triton.next_power_of_2(N2), MXFP4_QUANT_BLOCK_SIZE) + N = N1 * N2 + out = torch.empty((M, N // 2), dtype=torch.uint8, device=x.device) + out_block_scales = torch.empty( + (triton.cdiv(N, MXFP4_QUANT_BLOCK_SIZE), M), + dtype=torch.uint8, + device=x.device, + ).T + + grid = ( + M, + N1, + ) + _fused_flatten_mxfp4_quant[grid]( + x, + out, + out_block_scales, + *x.stride(), + *out.stride(), + *out_block_scales.stride(), + N2, + BLOCK_SIZE_N2, + MXFP4_QUANT_BLOCK_SIZE, + ) + + return out, out_block_scales diff --git a/aiter/ops/triton/fused_qk_concat.py b/aiter/ops/triton/fused_qk_concat.py new file mode 100644 index 0000000000000000000000000000000000000000..e14bbea4f1f530c4dda27970323d00a056743f00 --- /dev/null +++ b/aiter/ops/triton/fused_qk_concat.py @@ -0,0 +1,426 @@ +import torch +import triton +import triton.language as tl +from aiter.ops.triton.rope import _get_gptj_rotated_x_1D, _get_neox_rotated_x_1D + + +@triton.jit +def _unit_cat( + x1_ptr, + x2_ptr, + x_out_ptr, + b, + h, + d1_offs, + d2_offs, + x1_stride_b, + x1_stride_h, + x1_stride_d, + x2_stride_b, + x2_stride_h, + x2_stride_d, + x_out_stride_b, + x_out_stride_h, + x_out_stride_d, + BLOCK_D1: tl.constexpr, +): + x1_offs = b * x1_stride_b + h * x1_stride_h + d1_offs * x1_stride_d + x2_offs = b * x2_stride_b + h * x2_stride_h + d2_offs * x2_stride_d + x_out_offs = b * x_out_stride_b + h * x_out_stride_h + + x1 = tl.load(x1_ptr + x1_offs) + x2 = tl.load(x2_ptr + x2_offs) + + tl.store(x_out_ptr + x_out_offs + d1_offs * x_out_stride_d, x1) + tl.store(x_out_ptr + x_out_offs + (d2_offs + BLOCK_D1) * x_out_stride_d, x2) + + +@triton.jit +def _qk_cat_kernel( + q1_ptr, + q2_ptr, + k1_ptr, + k2_ptr, + q_out_ptr, + k_out_ptr, + q1_stride_b, + q1_stride_h, + q1_stride_d, + q2_stride_b, + q2_stride_h, + q2_stride_d, + k1_stride_b, + k1_stride_h, + k1_stride_d, + k2_stride_b, + k2_stride_h, + k2_stride_d, + q_out_stride_b, + 
def fused_qk_cat(
    q1: torch.Tensor,
    q2: torch.Tensor,
    k1: torch.Tensor,
    k2: torch.Tensor,
):
    """
    Concat q1 with q2 and k1 with k2 along the last dimension

    Key parameters:
    - q1: Tensor with shape (B, QH, D1).
    - q2: Tensor with shape (B, QH, D2).
    - k1: Tensor with shape (B, KH, D1).
    - k2: Tensor with shape (B, KH, D2).

    QH must be multiple of KH. D1 and D2 are used directly as `tl.arange`
    extents in the kernel, so both must be powers of two.

    Returns:
    - q_out: The output tensor with shape (B, QH, D1+D2), dtype/device of q1.
    - k_out: The output tensor with shape (B, KH, D1+D2), dtype/device of q1.
    """
    b, qh, d1 = q1.shape
    b2, qh2, d2 = q2.shape
    bk, kh, dk1 = k1.shape
    bk2, kh2, dk2 = k2.shape
    assert (
        b == b2 == bk == bk2
    ), "batch dimension should be identical across all inputs"
    assert qh == qh2, "Q head should be identical"
    assert kh == kh2, "K head should be identical"
    assert d1 == dk1, "D dimension of q1 and k1 should be identical"
    assert d2 == dk2, "D dimension of q2 and k2 should be identical"
    assert qh % kh == 0, "Number of Q heads must be multiple of number of K heads"
    # The kernel builds its column offsets with tl.arange(0, D), which only
    # accepts power-of-two extents; fail early with a readable message.
    assert d1 > 0 and (d1 & (d1 - 1)) == 0, "D1 must be a power of two"
    assert d2 > 0 and (d2 & (d2 - 1)) == 0, "D2 must be a power of two"

    q_out = torch.empty((b, qh, d1 + d2), dtype=q1.dtype, device=q1.device)
    k_out = torch.empty((b, kh, d1 + d2), dtype=q1.dtype, device=q1.device)

    # One program per (batch, Q head); the program whose Q head index is a
    # multiple of QH_PER_KH also copies the matching K head.
    grid = (b, qh, 1)

    _qk_cat_kernel[grid](
        q1,
        q2,
        k1,
        k2,
        q_out,
        k_out,
        *q1.stride(),
        *q2.stride(),
        *k1.stride(),
        *k2.stride(),
        *q_out.stride(),
        *k_out.stride(),
        QH_PER_KH=qh // kh,
        BLOCK_D1=d1,
        BLOCK_D2=d2,
    )

    return q_out, k_out
x_pe_rotated * sin + x_pe = x_pe.to(x_pe_ptr.dtype.element_ty) + + tl.store(x_out_ptr + x_out_offs + d_nope_offs * x_out_stride_d, x_nope) + tl.store(x_out_ptr + x_out_offs + (d_pe_offs + BLOCK_D_nope) * x_out_stride_d, x_pe) + + +@triton.jit +def _qk_rope_cat_kernel( + q_nope_ptr, + q_pe_ptr, + k_nope_ptr, + k_pe_ptr, + pos_ptr, + cos_ptr, + sin_ptr, + q_out_ptr, + k_out_ptr, + q_nope_stride_b, + q_nope_stride_h, + q_nope_stride_d, + q_pe_stride_b, + q_pe_stride_h, + q_pe_stride_d, + k_nope_stride_b, + k_nope_stride_h, + k_nope_stride_d, + k_pe_stride_b, + k_pe_stride_h, + k_pe_stride_d, + pos_stride_b, + cos_stride_b, + cos_stride_d, + q_out_stride_b, + q_out_stride_h, + q_out_stride_d, + k_out_stride_b, + k_out_stride_h, + k_out_stride_d, + QH_PER_KH: tl.constexpr, + REUSE_FREQS_FRONT_PART: tl.constexpr, + IS_NEOX: tl.constexpr, + BLOCK_D_nope: tl.constexpr, + BLOCK_D_pe: tl.constexpr, + BLOCK_D_HALF_pe: tl.constexpr, +): + pid_b = tl.program_id(0) + pid_hq = tl.program_id(1) + + d_nope_offs = tl.arange(0, BLOCK_D_nope) + d_pe_offs = tl.arange(0, BLOCK_D_pe) + + if REUSE_FREQS_FRONT_PART: + if IS_NEOX: + d_cos_offs = d_pe_offs + d_cos_offs = tl.where( + (d_cos_offs >= BLOCK_D_HALF_pe) & (d_cos_offs < BLOCK_D_pe), + d_cos_offs - BLOCK_D_HALF_pe, + d_cos_offs, + ).to(d_cos_offs.dtype) + # d_cos_mask = d_cos_offs < BLOCK_D_pe + else: + d_cos_offs = d_pe_offs // 2 + # d_cos_mask = d_cos_offs < BLOCK_D_HALF_pe + else: + d_cos_offs = d_pe_offs + # d_cos_mask = d_cos_offs < BLOCK_D_pe + + pos = tl.load(pos_ptr + pid_b * pos_stride_b) + cos_offs = pos * cos_stride_b + d_cos_offs * cos_stride_d + cos = tl.load(cos_ptr + cos_offs) + sin = tl.load(sin_ptr + cos_offs) + + _unit_rope_cat( + q_nope_ptr, + q_pe_ptr, + cos, + sin, + q_out_ptr, + pid_b, + pid_hq, + d_nope_offs, + d_pe_offs, + q_nope_stride_b, + q_nope_stride_h, + q_nope_stride_d, + q_pe_stride_b, + q_pe_stride_h, + q_pe_stride_d, + q_out_stride_b, + q_out_stride_h, + q_out_stride_d, + IS_NEOX, + BLOCK_D_nope, + 
def fused_qk_rope_cat(
    q_nope: torch.Tensor,
    q_pe: torch.Tensor,
    k_nope: torch.Tensor,
    k_pe: torch.Tensor,
    pos: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    is_neox: bool,
):
    """
    Perform RoPE on q_pe and k_pe and concat q_nope with q_pe and k_nope with k_pe along the last dimension

    Key parameters:
    - q_nope: Tensor with shape (B, QH, D1).
    - q_pe: Tensor with shape (B, QH, D2).
    - k_nope: Tensor with shape (B, KH, D1).
    - k_pe: Tensor with shape (B, KH, D2).
    - pos: Positions; the kernel reads one entry per batch index (stride 0 of pos).
    - cos: Cosine table, indexed by position along dim 0 and by frequency
      along the last dim. Its last dim must be D2 or D2 // 2.
    - sin: Sine table. The kernel addresses it with the strides of `cos`,
      so it must share the layout of `cos`.
    - is_neox: Select the NeoX rotation (first half / second half) instead of
      the GPT-J rotation (even / odd elements).

    QH must be multiple of KH. D1 and D2 are used directly as `tl.arange`
    extents in the kernel, so both must be powers of two.

    Returns:
    - q_out: The output tensor with shape (B, QH, D1+D2), dtype/device of q_nope.
    - k_out: The output tensor with shape (B, KH, D1+D2), dtype/device of q_nope.
    """
    b, qh, d_nope = q_nope.shape
    b2, qh2, d_pe = q_pe.shape
    bk, kh, dk1 = k_nope.shape
    bk2, kh2, dk2 = k_pe.shape

    assert (
        b == b2 == bk == bk2
    ), "batch dimension should be identical across all inputs"
    assert qh == qh2, "Q head should be identical"
    assert kh == kh2, "K head should be identical"
    assert d_nope == dk1, "D dimension of q_nope and k_nope should be identical"
    assert d_pe == dk2, "D dimension of q_pe and k_pe should be identical"
    assert qh % kh == 0, "Q heads must be multiple of K heads"
    # tl.arange(0, D) in the kernel only accepts power-of-two extents.
    assert d_nope > 0 and (d_nope & (d_nope - 1)) == 0, "D1 must be a power of two"
    assert d_pe > 0 and (d_pe & (d_pe - 1)) == 0, "D2 must be a power of two"
    d_freq = cos.shape[-1]
    assert (d_freq == d_pe // 2) or (
        d_freq == d_pe
    ), "cos/sin last dim should be the same or half of the qk last dim"
    # With a half-width table each frequency is reused for two elements.
    reuse_freqs_front_part = d_freq == d_pe // 2

    q_out = torch.empty(
        (b, qh, d_nope + d_pe), dtype=q_nope.dtype, device=q_nope.device
    )
    k_out = torch.empty(
        (b, kh, d_nope + d_pe), dtype=q_nope.dtype, device=q_nope.device
    )

    # One program per (batch, Q head); the program whose Q head index is a
    # multiple of QH_PER_KH also processes the matching K head.
    grid = (b, qh, 1)

    _qk_rope_cat_kernel[grid](
        q_nope,
        q_pe,
        k_nope,
        k_pe,
        pos,
        cos,
        sin,
        q_out,
        k_out,
        *q_nope.stride(),
        *q_pe.stride(),
        *k_nope.stride(),
        *k_pe.stride(),
        pos.stride(0),
        cos.stride(0),
        cos.stride(-1),
        *q_out.stride(),
        *k_out.stride(),
        QH_PER_KH=qh // kh,
        REUSE_FREQS_FRONT_PART=reuse_freqs_front_part,
        IS_NEOX=is_neox,
        BLOCK_D_nope=d_nope,
        BLOCK_D_pe=d_pe,
        BLOCK_D_HALF_pe=d_pe // 2,
    )

    return q_out, k_out
@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
        "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"])
        * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]),
    }
)
@triton.jit
def _gemm_a16_w16_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    GRID_MN: tl.constexpr,
    cache_modifier: tl.constexpr,
):
    """Kernel for computing the matmul C = A x B.
    A has shape (M, K), B has shape (K, N) and C has shape (M, N)

    Each program computes one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of C.
    EVEN_K and GRID_MN are filled in by the heuristics above.
    """

    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)

    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    # remap_xcd returns the remapped id; it has to be assigned back,
    # otherwise the remapping has no effect on the tile ordering.
    pid = remap_xcd(pid, GRID_MN)

    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M)

    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    # Create pointers for first block of A and B input matrices.
    # Row/column indices are wrapped with % M / % N so that the loads of a
    # partial edge tile stay in bounds without an M/N mask.
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    offs_am = (pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    # Accumulate in int32 for int8 outputs and in fp32 otherwise.
    acc_dtype = tl.float32 if c_ptr.type.element_ty != tl.int8 else tl.int32
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype)

    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Load the next block of A and B, generate a mask by checking the K dimension.
        # If it is out of bounds, set it to 0.
        if EVEN_K:
            a = tl.load(a_ptrs)
            b = tl.load(b_ptrs, cache_modifier=cache_modifier)
        else:
            a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
            b = tl.load(
                b_ptrs,
                mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
                other=0.0,
                cache_modifier=cache_modifier,
            )

        accumulator += tl.dot(a, b, input_precision="ieee")

        # Advance the ptrs to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    c = accumulator.to(c_ptr.type.element_ty)

    # Write back the block of the output matrix C with masks.
    offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, c, mask=c_mask)
+ """ + + M, K = x.shape + N, K = w.shape + w = w.T + + if y is None: + y = torch.empty((M, N), dtype=dtype, device=x.device) + + if config is None: + config = _get_config(M, N, K) + + grid = lambda META: ( # noqa: E731 + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + _gemm_a16_w16_kernel[grid]( + x, + w, + y, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + y.stride(0), + y.stride(1), + **config, + ) + + return y diff --git a/aiter/ops/triton/gemm_a16w16_atomic.py b/aiter/ops/triton/gemm_a16w16_atomic.py new file mode 100644 index 0000000000000000000000000000000000000000..ab623a3ee88cf59d565d519a5ed9d0f8e94ac817 --- /dev/null +++ b/aiter/ops/triton/gemm_a16w16_atomic.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: MIT + +from typing import Optional +import functools +import json +import torch +import triton +import triton.language as tl +from aiter.ops.triton.utils.pid_preprocessing import pid_grid, remap_xcd +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH +import os + + +@triton.heuristics( + { + "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"]) == 0) + and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) + and (args["K"] % (args["SPLITK_BLOCK_SIZE"]) == 0), + "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) + * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), + } +) +@triton.jit +def _gemm_a16_w16_atomic_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + NUM_KSPLIT: tl.constexpr, + SPLITK_BLOCK_SIZE: tl.constexpr, + cache_modifier: tl.constexpr, + EVEN_K: tl.constexpr, + GRID_MN: tl.constexpr, +): + """Kernel for computing the matmul C = A x B. 
+ A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid_unified = tl.program_id(axis=0) + pid_k = pid_unified % NUM_KSPLIT + pid = pid_unified // NUM_KSPLIT + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + if NUM_KSPLIT == 1: + pid = remap_xcd(pid, GRID_MN) + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) + else: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + tl.assume(pid_k >= 0) + if (pid_k * SPLITK_BLOCK_SIZE) < K: + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE, BLOCK_SIZE_K) + + # Create pointers for first block of A and B input matrices + offs_k = tl.arange(0, BLOCK_SIZE_K) + offs_k_split = pid_k * (SPLITK_BLOCK_SIZE) + offs_k + offs_am = (pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak + ) + b_ptrs = b_ptr + ( + offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn + ) + + acc_dtype = tl.float32 if c_ptr.type.element_ty != tl.int8 else tl.int32 + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype) + + for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. 
+ if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs, cache_modifier=cache_modifier) + else: + a = tl.load( + a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0 + ) + b = tl.load( + b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0 + ) + + accumulator += tl.dot(a, b, input_precision="ieee") + + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + c = accumulator.to(c_ptr.type.element_ty) + + # Write back the block of the output matrix C with masks. + offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + if NUM_KSPLIT == 1: + tl.store(c_ptrs, c, mask=c_mask) + else: + tl.atomic_add(c_ptrs, c, mask=c_mask, sem="relaxed") + + +@functools.lru_cache(maxsize=1024) +def _get_config( + M: int, + N: int, + K: int, +): + if not hasattr(_get_config, "_config_dict"): + dev = arch_info.get_device() + _get_config._config_dict = {} + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A16W16-ATOMIC.json" + with open(fpath, "r") as file: + config = json.load(file) + _get_config._config_dict["default"] = config + + key = f"{N}_{K}" + if key not in _get_config._config_dict.keys(): + dev = arch_info.get_device() + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A16W16-ATOMIC-N={N}-K={K}.json" + if os.path.exists(fpath): + with open(fpath, "r") as file: + config = json.load(file) + _get_config._config_dict[key] = config + else: + key = "default" # fall back to default config + # single config. 
def gemm_a16w16_atomic(
    x,
    w,
    dtype: Optional[torch.dtype] = torch.bfloat16,
    y: Optional[torch.Tensor] = None,
    config: Optional[dict] = None,
):
    """
    Computes the 16 bit matmul Y = X x W
    NOTE: If dtype is set to bf16, aggregation in bf16 with atomic_add will lead to slight precision loss.
    Key parameters:
    - X: Matrix X with shape (M, K).
    - W: Matrix W with shape (N, K).
    - dtype: Optional parameter to specify bf16 or fp16 datatype. Default is bf16
    - Y: Output Matrix Y with shape (M, N). If this is none, then it's created by this API and returned as output.
      When Y is supplied and the config uses NUM_KSPLIT > 1, the kernel
      accumulates into Y with atomic adds, so the caller must pass a
      zero-initialized tensor.
    - config: Optional kernel configuration. It is copied before defaults are
      filled in, so the caller's dict is left untouched.

    Returns:
    - Y: The output matrix with shape (M, N).
    """
    M, K = x.shape
    N, K_w = w.shape
    assert K == K_w, "x and w must have the same K (last) dimension"
    w = w.T

    if config is None:
        config = _get_config(M, N, K)
    # _get_config hands out dicts that live in an lru_cache, and a caller may
    # reuse its own dict across calls; copy before writing per-call values.
    config = dict(config)
    # For compatibility reasons, these keys may not exist in the config
    # TODO: This needs to be embedded in the configs later
    config.setdefault("NUM_KSPLIT", 1)
    config.setdefault("cache_modifier", "")

    if y is None:
        # atomic add requires 0 tensor
        if config["NUM_KSPLIT"] == 1:
            y = torch.empty((M, N), dtype=dtype, device=x.device)
        else:
            y = torch.zeros((M, N), dtype=dtype, device=x.device)

    grid = lambda META: (  # noqa: E731
        triton.cdiv(M, META["BLOCK_SIZE_M"])
        * triton.cdiv(N, META["BLOCK_SIZE_N"])
        * META["NUM_KSPLIT"],
    )
    # NOTE: if k split doesnt divide K evenly, this will waste compute
    config["SPLITK_BLOCK_SIZE"] = triton.cdiv(K, config["NUM_KSPLIT"])
    _gemm_a16_w16_atomic_kernel[grid](
        x,
        w,
        y,
        M,
        N,
        K,
        x.stride(0),
        x.stride(1),
        w.stride(0),
        w.stride(1),
        y.stride(0),
        y.stride(1),
        **config,
    )

    return y
def awq_reorder_and_repack(
    qweight: torch.Tensor,
    qzeros: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Reorder and pack weights and zeros using AWQ order.

    This function unpacks the 4-bit quantized weights and zeros from int32,
    applies reverse_awq_order to reorder them, and then packs them two
    4-bit values per int8 (first value in the low nibble).
    For weight, repack to [N, K//2]
    For zeros, repack to [K//G, N//2]
    Args:
        qweight: Quantized weight tensor of shape [K, N // 8] with dtype int32
        qzeros: Quantized zero points tensor of shape [K // G, N // 8] with dtype int32

    Returns:
        Tuple of (reordered_qweight, reordered_qzeros) both with dtype int8
    """
    bits = 4
    nibble_mask = (1 << bits) - 1
    shifts = torch.arange(0, 32, bits, device=qweight.device)
    K = qweight.shape[0]
    N = qweight.shape[1] * 8
    # Take the group count straight from qzeros. Rebuilding it as
    # K // (K // num_groups) gives a different number whenever K is not an
    # exact multiple of the group count.
    num_groups = qzeros.shape[0]

    # Unpack weights: [K, N//8] -> [K, N//8, 8] -> [K, N]
    iweights = torch.bitwise_right_shift(
        qweight[:, :, None],
        shifts[None, None, :],
    ).to(torch.int8)
    iweights = iweights.view(K, -1)

    # Unpack zeros: [K//G, N//8] -> [K//G, N//8, 8] -> [K//G, N]
    zeros = torch.bitwise_right_shift(
        qzeros[:, :, None],
        shifts[None, None, :],
    ).to(torch.int8)
    zeros = zeros.view(num_groups, -1)

    # Apply reverse AWQ order to both tensors
    iweights = reverse_awq_order(iweights)
    zeros = reverse_awq_order(zeros)

    # Mask to 4 bits
    iweights = torch.bitwise_and(iweights, nibble_mask)
    zeros = torch.bitwise_and(zeros, nibble_mask)

    # Weights are packed along the K direction: [K, N] -> [N, K] -> [N, K//2, 2]
    iweights = iweights.transpose(1, 0).contiguous()
    iweights_packed = iweights.view(N, -1, 2)

    # Zeros are packed along the N direction: [K//G, N] -> [K//G, N//2, 2]
    zeros_packed = zeros.view(num_groups, -1, 2)

    # Pack 2 int4 values into int8 using bit shifts
    # Direct packing: pack in the order they appear after reordering
    packed_weights = torch.zeros([N, K // 2], dtype=torch.int8, device=qweight.device)
    packed_zeros = torch.zeros(
        [num_groups, N // 2], dtype=torch.int8, device=zeros.device
    )

    for i in range(2):
        packed_weights |= iweights_packed[:, :, i].to(torch.int8) << (i * bits)
        packed_zeros |= zeros_packed[:, :, i].to(torch.int8) << (i * bits)

    return packed_weights, packed_zeros
+ pid_n = tl.program_id(axis=0) + pid_k = tl.program_id(axis=1) + + tl.assume(pid_n >= 0) + tl.assume(pid_k >= 0) + tl.assume(N > 0) + tl.assume(K > 0) + tl.assume(N2 > 0) + tl.assume(K2 > 0) + tl.assume(BLOCK_SIZE_N > 0) + tl.assume(BLOCK_SIZE_K > 0) + tl.assume(BLOCK_SIZE_K2 > 0) + tl.assume(group_size > 0) + + # Compute offsets and masks for qweight_ptr. + offsets_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offsets_n = tl.max_contiguous(tl.multiple_of(offsets_n, BLOCK_SIZE_N), BLOCK_SIZE_N) + offsets_k = pid_k * BLOCK_SIZE_K2 + tl.arange(0, BLOCK_SIZE_K2) + offsets_k = tl.max_contiguous(tl.multiple_of(offsets_k, BLOCK_SIZE_K2), BLOCK_SIZE_K2) + offsets = K2 * offsets_n[:, None] + offsets_k[None, :] + + masks_n = offsets_n < N + masks_k = offsets_k < K2 + + masks = masks_n[:, None] & masks_k[None, :] + + # Compute offsets and masks for result output ptr. + result_offsets_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + result_offsets_k = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + result_offsets = (N * result_offsets_k[:, None] + result_offsets_n[None, :]) # [K, N] + + result_masks_n = result_offsets_n < N + result_masks_k = result_offsets_k < K + result_masks = result_masks_k[:, None] & result_masks_n[None, :] + + # Load the weights. + iweights = tl.load(qweight_ptr + offsets, masks, 0.0) #[BLOCK_SIZE_N, BLOCK_SIZE_K//2] + iweights = tl.interleave(iweights, iweights) # [BLOCK_SIZE_N, BLOCK_SIZE_K] + + # Use this to compute a set of shifts that can be used to unpack and + # reorder the values in iweights and zeros. + shifts = tl.arange(0, 2) * 4 + bshifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_N * BLOCK_SIZE_K2, 2)) + bshifts = tl.reshape(bshifts, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + + # Unpack and reorder: shift out the correct 4-bit value and mask. + iweights = (iweights >> bshifts) & 0xF + + # Compute zero offsets and masks. 
+ zero_offsets_k = pid_k * BLOCK_SIZE_K // group_size + tl.arange(0, NUM_GROUPS) + zero_offsets_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + zero_offsets_n2 = zero_offsets_n // 2 + zero_offsets = N2 * zero_offsets_k[:, None] + zero_offsets_n2[None, :] + + zero_masks_k = zero_offsets_k < K//group_size + zero_masks_n = zero_offsets_n < N + zero_masks = zero_masks_k[:, None] & zero_masks_n[None, :] + + # Load the zeros. + zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0) # [NUM_GROUPS, BLOCK_SIZE_N] + + # Compute scale offsets and masks. + scale_offsets_k = pid_k * BLOCK_SIZE_K // group_size + tl.arange(0, NUM_GROUPS) + scale_offsets_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) + scale_offsets = N * scale_offsets_k[:, None] + scale_offsets_n[None, :] + scale_masks_k = scale_offsets_k < K//group_size + scale_masks_n = scale_offsets_n < N + scale_masks = scale_masks_k[:, None] & scale_masks_n[None, :] + + # Load the scales. + scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0) # [NUM_GROUPS, BLOCK_SIZE_N] + + if NUM_GROUPS == 1: + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) # [BLOCK_SIZE_K, BLOCK_SIZE_N] + scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) # [BLOCK_SIZE_K, BLOCK_SIZE_N] + else: + zeros = tl.broadcast_to(zeros[:, None, :], (NUM_GROUPS, group_size, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales[:, None, :], (NUM_GROUPS, group_size, BLOCK_SIZE_N)) + zeros = tl.reshape(zeros, [BLOCK_SIZE_K, BLOCK_SIZE_N]) + scales = tl.reshape(scales, [BLOCK_SIZE_K, BLOCK_SIZE_N]) + + # Unpack and reorder: shift out the correct 4-bit value and mask. + zshifts = (zero_offsets_n[None, :] % 2) * 4 # [1, BLOCK_SIZE_N] + zeros = (zeros >> zshifts) & 0xF # [BLOCK_SIZE_K, BLOCK_SIZE_N] + + # Dequantize. + iweights = (iweights.T - zeros) * scales + iweights = iweights.to(result_ptr.type.element_ty) + + # Finally, store. 
+ tl.store(result_ptr + result_offsets, iweights, result_masks) + +@triton.jit +def awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, not_reduce, GROUP_SIZE: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + NUM_GROUPS: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr = 0): + + if not USE_REDUCE_KERNEL: + k_idx = 0 + + tl.assume(tile_idx >= 0) + tl.assume(k_idx >= 0) + tl.assume(iter_begin >= 0) + tl.assume(iter_end >= 0) + tl.assume(M > 0) + tl.assume(N > 0) + tl.assume(K > 0) + tl.assume(K2 > 0) + tl.assume(N2 > 0) + + num_tile_m = tl.cdiv(M, BLOCK_SIZE_M) + num_tile_n = tl.cdiv(N, BLOCK_SIZE_N) + + tile_idx_m = tile_idx // num_tile_n + tile_idx_n = tile_idx % num_tile_n + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7] + # that will map given indices to the correct order. + #reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]).reshape(8) + + # Create the necessary shifts to use to unpack. + #shifts = reverse_awq_order_tensor * 4 + shifts = tl.arange(0, 2) * 4 + + #zshifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_K * (BLOCK_SIZE_N // 2), 2)) + #zshifts = tl.reshape(zshifts, (BLOCK_SIZE_K, BLOCK_SIZE_N)).T + + #bshifts = tl.broadcast_to(shifts[:, None], (8, (BLOCK_SIZE_K // 8) * BLOCK_SIZE_N)) + bshifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_N * (BLOCK_SIZE_K // 2), 2)) + bshifts = tl.reshape(bshifts, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + + # Offsets and masks. 
+ offsets_am = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + masks_am = offsets_am < M + + #offsets_zn = tile_idx_n * (BLOCK_SIZE_N // 2) + tl.arange(0, BLOCK_SIZE_N // 2) + offsets_zn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offsets_zn = offsets_zn // 2 + #offsets_zn = tl.max_contiguous(tl.multiple_of(offsets_zn, BLOCK_SIZE_N // 2), BLOCK_SIZE_N // 2) + masks_zn = offsets_zn < N2 + + offsets_bn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + #offsets_bn = tl.max_contiguous(tl.multiple_of(offsets_bn, BLOCK_SIZE_N), BLOCK_SIZE_N) + masks_bn = offsets_bn < N + + offsets_sn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + #offsets_sn = tl.max_contiguous(tl.multiple_of(offsets_sn, BLOCK_SIZE_N), BLOCK_SIZE_N) + masks_sn = offsets_sn < N + + offsets_ak = iter_begin * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + #offsets_ak = tl.max_contiguous(tl.multiple_of(offsets_ak, BLOCK_SIZE_K), BLOCK_SIZE_K) + offsets_a = K * offsets_am[:, None] + offsets_ak[None, :] + + offsets_bk = iter_begin * (BLOCK_SIZE_K // 2) + tl.arange(0, BLOCK_SIZE_K // 2) + #offsets_bk = tl.max_contiguous(tl.multiple_of(offsets_bk, BLOCK_SIZE_K // 2), BLOCK_SIZE_K // 2) + #offsets_b = offsets_bk[:, None] + K // 2 * offsets_bn[None, :] + #offsets_b = K // 2 * offsets_bn[:, None] + offsets_bk[None, :] + offsets_b = K2 * offsets_bn[:, None] + offsets_bk[None, :] + zshifts = (offsets_bn[:, None] % 2) * 4 # [N, 1] + zshifts = zshifts.T + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + for k in range(iter_end - iter_begin): + masks_ak = offsets_ak < K + masks_bk = offsets_bk < K2 + masks_a = masks_am[:, None] & masks_ak[None, :] + masks_b = masks_bn[:, None] & masks_bk[None, :] + other_bzs = 0.0 + a = tl.load(a_ptrs, mask=masks_a, other=0.) + b = tl.load(b_ptrs, masks_b, other_bzs) #[N, K//2] + b = tl.interleave(b, b) # [N, K] + + # Dequantize b. 
+ offsets_szk = ((BLOCK_SIZE_K * k + iter_begin * BLOCK_SIZE_K) // GROUP_SIZE + tl.arange(0, NUM_GROUPS)) + masks_szk = offsets_szk < K // GROUP_SIZE + masks_z = masks_szk[:, None] & masks_zn[None, :] + masks_s = masks_szk[:, None] & masks_sn[None, :] + #masks_z = masks_zn[:, None] & masks_szk[None, :] + #masks_s = masks_sn[:, None] & masks_szk[None, :] + + offsets_z = N2 * offsets_szk[:, None] + offsets_zn[None, :] + #offsets_z = K // GROUP_SIZE * offsets_zn[:, None] + offsets_szk[None, :] + zeros_ptrs = zeros_ptr + offsets_z + zeros = tl.load(zeros_ptrs, mask=masks_z, other=other_bzs) # [K//G, N] + #zshifts = (offsets_bn[:, None] % 2) * 4 # [N, 1] + #zeros = (zeros >> _zshifts) & 0xF # [N, K//G] + + ''' + zeros = zeros.T # [K//G, N//2] + zeros = tl.interleave(zeros, zeros) # [K//G, N] + zeros = zeros.T # [N, K//G] + ''' + + offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :] + #offsets_s = K // GROUP_SIZE * offsets_sn[:, None] + offsets_szk[None, :] + scales_ptrs = scales_ptr + offsets_s + scales = tl.load(scales_ptrs, mask=masks_s, other=other_bzs) # [K//G, N] + + if NUM_GROUPS == 1: + # Original efficient implementation for single group + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + #zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + #scales = tl.broadcast_to(scales, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + else: + # Reshape to (NUM_GROUPS, 1, N) then broadcast to (NUM_GROUPS, group_size_in_block, N) + # Reshape to (K//G, 1, N) then broadcast to (K//G, group_size_in_block, N) + zeros = tl.broadcast_to(zeros[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + ## Reshape back to (BLOCK_SIZE_K, N) + zeros = tl.reshape(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.reshape(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + + # Reshape to (N, K//G, 1) then broadcast to (N, K//G, 
group_size_in_block) + #zeros = tl.broadcast_to(zeros[:, :, None], (BLOCK_SIZE_N, NUM_GROUPS, GROUP_SIZE)) + #scales = tl.broadcast_to(scales[:, :, None], (BLOCK_SIZE_N, NUM_GROUPS, GROUP_SIZE)) + #zeros = tl.reshape(zeros, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + #scales = tl.reshape(scales, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + + b = (b >> bshifts) & 0xF + b = b.T + zeros = (zeros >> zshifts) & 0xF + b = (b - zeros) * scales + b = b.to(a_ptr.type.element_ty) + + # Accumulate results. + accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32) + + offsets_ak += BLOCK_SIZE_K + offsets_bk += BLOCK_SIZE_K // 2 + a_ptrs += BLOCK_SIZE_K + b_ptrs += BLOCK_SIZE_K // 2 + + c = accumulator.to(c_ptr.type.element_ty) + offs_cm = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # compiler hints + offs_cm = tl.max_contiguous(tl.multiple_of(offs_cm, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_cn = tl.max_contiguous(tl.multiple_of(offs_cn, BLOCK_SIZE_N), BLOCK_SIZE_N) + offs_c = M * N * k_idx + N * offs_cm[:, None] + offs_cn[None, :] + c_ptrs = c_ptr + offs_c + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + + if USE_REDUCE_KERNEL: + tl.store(c_ptrs, c, mask=c_mask) + else: + if c_ptr.type.element_ty == tl.float16: + tl.store(c_ptrs, c, mask=c_mask) + elif not_reduce: + tl.store(c_ptrs, c, mask=c_mask) + else: + tl.atomic_add(c_ptrs, c, mask=c_mask) + +@triton.jit +def awq_gemm_kernel_streamk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, NUM_CUS: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr, + NUM_GROUPS: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr): + + pid = tl.program_id(axis=0) + + if pid < DP_TILES: + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K) + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, pid, 0, 0, iters_per_cta, + M, N, N2, K, K2, 
True, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + else: + iters_per_tile = tl.cdiv(K, BLOCK_SIZE_K) + total_iters = iters_per_tile * DANGLING_TILES + iters_per_cta = tl.cdiv(total_iters, NUM_CUS) + + iter_begin = (pid - DP_TILES) * iters_per_cta + iter_end = tl.minimum(iter_begin + iters_per_cta, total_iters) + + while iter_begin < iter_end: + tile_idx = iter_begin // iters_per_tile + DP_TILES + tile_iter_begin = (tile_idx - DP_TILES) * iters_per_tile + tile_iter_end = tile_iter_begin + iters_per_tile + local_iter_begin = iter_begin - tile_iter_begin + local_iter_end = tl.minimum(iter_end, tile_iter_end) - tile_iter_begin + k_idx = tl.cdiv(local_iter_begin, iters_per_cta) + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, tile_idx, k_idx, local_iter_begin, local_iter_end, + M, N, N2, K, K2, False, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + iter_begin = tile_iter_end + +@triton.jit +def awq_gemm_kernel_splitk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, NUM_CUS: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + SPLITK: tl.constexpr, NUM_GROUPS: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr): + + pid = tl.program_id(axis=0) + + tiles_M = tl.cdiv(M, BLOCK_SIZE_M) + tiles_N = tl.cdiv(N, BLOCK_SIZE_N) + total_tiles = tiles_M * tiles_N + tile_idx = pid % total_tiles + + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K * SPLITK) + iter_begin = pid // total_tiles * iters_per_cta + iter_end = iter_begin + iters_per_cta + k_idx = tl.cdiv(iter_begin, iters_per_cta) + + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, False, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + +@triton.jit +def awq_gemm_kernel_splitk_fused( + a_ptr, b_ptr, c_ptr, + zeros_ptr, scales_ptr, + 
out_ptr, barrier_ptr, + M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, + NUM_CUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + SPLITK: tl.constexpr, + NUM_GROUPS: tl.constexpr, + USE_REDUCE_KERNEL: tl.constexpr): + + pid = tl.program_id(axis=0) + tiles_M = tl.cdiv(M, BLOCK_SIZE_M) + tiles_N = tl.cdiv(N, BLOCK_SIZE_N) + total_tiles = tiles_M * tiles_N + + if pid < total_tiles * SPLITK: + tile_idx = pid % total_tiles + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K * SPLITK) + iter_begin = pid // total_tiles * iters_per_cta + iter_end = iter_begin + iters_per_cta + k_idx = tl.cdiv(iter_begin, iters_per_cta) + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, False, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + # set barriers + tile_idx_m = tile_idx // tiles_N + tile_idx_n = tile_idx % tiles_N + offset = total_tiles * k_idx + tiles_N * tile_idx_m + tile_idx_n + tl.store(barrier_ptr + offset, 1, cache_modifier=".wt") + else: + pid = pid - total_tiles * SPLITK + # reduce kernel + pid_m = pid // tiles_N + pid_n = pid % tiles_N + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # compiler hints + offs_m = tl.max_contiguous(tl.multiple_of(offs_m, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_n = tl.max_contiguous(tl.multiple_of(offs_n, BLOCK_SIZE_N), BLOCK_SIZE_N) + + mask_m = offs_m < M + mask_n = offs_n < N + mask = mask_m[:, None] & mask_n[None, :] + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + # reduce split k + for batch_idx in range(SPLITK): + batch_offset = batch_idx * M * N + input_offsets = batch_offset + offs_m[:, None] * N + offs_n[None, :] + # wait barrier + offset = total_tiles * batch_idx + pid_m * tiles_N + pid_n + while tl.load(barrier_ptr + offset, cache_modifier=".cv", volatile=True) != 1: + pass + 
input_data = tl.load(c_ptr + input_offsets, mask=mask, other=0.0) + acc += input_data + + output_offsets = offs_m[:, None] * N + offs_n[None, :] + acc_f16 = acc.to(tl.float16) + tl.store(out_ptr + output_offsets, acc_f16, mask=mask) + +''' +@triton.autotune( + configs=[ + triton.Config({ + }, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8, 16] for num_stages in [1, 2] + ], + key=["M", "K", "N", "GROUP_SIZE", "BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K", "SCHEDULER", "SPLITK"], + perf_debug=True, + #enable=int(os.getenv("TRITON_DO_AUTOTUNING", 0)) == 1, + #prune_configs_by={ + # "early_config_prune": lambda configs, nargs, **kwargs: [ + # config for config in configs + # # SCHEDULE=1 代表 STREAMK,不需要遍历那么多 SPLITK 的值 + # if config.all_kwargs()["SCHEDULER"] == 0 or (config.all_kwargs()["SCHEDULER"] == 1 and config.all_kwargs()["SPLITK"] == 1) + # ] + #} +) +@triton.heuristics(values={ + "NUM_GROUPS": lambda args: triton.cdiv(args["BLOCK_SIZE_K"], args["GROUP_SIZE"]) +}) +''' +@triton.jit +def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, NUM_CUS: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + NUM_GROUPS: tl.constexpr, + DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr, + SPLITK: tl.constexpr, SCHEDULER: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr): + if SCHEDULER == 0: + return awq_gemm_kernel_splitk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, GROUP_SIZE, NUM_CUS, BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + SPLITK, NUM_GROUPS, USE_REDUCE_KERNEL) + else: + return awq_gemm_kernel_streamk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, GROUP_SIZE, NUM_CUS, BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + DP_TILES, DANGLING_TILES, NUM_GROUPS, USE_REDUCE_KERNEL) + +@triton.jit +def awq_gemm_kernel_fused( + a_ptr, b_ptr, c_ptr, + zeros_ptr, + scales_ptr, + out_ptr, barrier_ptr, + M, N, N2, K, K2, + 
GROUP_SIZE: tl.constexpr, + NUM_CUS: tl.constexpr, + # NUM_GEMM_CUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + NUM_GROUPS: tl.constexpr, + DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr, + SPLITK: tl.constexpr, SCHEDULER: tl.constexpr, + USE_REDUCE_KERNEL: tl.constexpr): + if SCHEDULER == 0: + return awq_gemm_kernel_splitk_fused( + a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, + out_ptr, barrier_ptr, + M, N, N2, K, K2, + GROUP_SIZE, + NUM_CUS, + # NUM_GEMM_CUS, + BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + SPLITK, NUM_GROUPS, USE_REDUCE_KERNEL) + else: + # TODO: to be supported + assert False + return awq_gemm_kernel_streamk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, GROUP_SIZE, NUM_CUS, BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + DP_TILES, DANGLING_TILES, NUM_GROUPS, USE_REDUCE_KERNEL) + +# qweights - [N , K // 2], int8 +# scales - [K // G, N ], float16 +# zeros - [K // G, N // 2], int8 +# result - [K, N], float16 +def awq_dequantize_triton(qweight: torch.Tensor, + scales: torch.Tensor, + zeros: torch.Tensor, + **kwargs) -> torch.Tensor: + N = qweight.shape[0] + K = qweight.shape[1] * 2 + group_size = K // scales.shape[0] + + assert K > 0 and N > 0 + assert scales.shape[0] == K // group_size and scales.shape[1] == N + assert zeros.shape[0] == K // group_size and zeros.shape[1] == N // 2 + assert group_size <= K + assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K + + configs = { + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "num_warps": 4, + "num_stages": 1 + } + + result = torch.empty(K, N, device=qweight.device, dtype=scales.dtype) + + grid = lambda META: ( + triton.cdiv(N, META['BLOCK_SIZE_N']), + triton.cdiv(K, META['BLOCK_SIZE_K']), + ) + awq_dequantize_kernel[grid](qweight, + scales, + zeros, + result, + N, + N//2, + K, + K//2, + group_size, + **configs + ) + + return result + + +@functools.lru_cache +def get_w4a16_awq_gemm_config_filepath(N: int, 
K: int, GROUP_SIZE: int, **kwargs) -> str: + device_name = arch_info.get_device() + if device_name.lower().startswith("bw"): + device_name = "BW200" + if "k100" in device_name.lower(): + device_name = "K100_AI" + json_file_name = f"awq_gemm_N={N},K={K},device_name={device_name},dtype=w4a16,group_size={GROUP_SIZE}.json" + + config_file_path = os.path.join( + f"{AITER_TRITON_CONFIGS_PATH}", "gemm/awq_w4a16", json_file_name + ) + return config_file_path + +@functools.lru_cache +def get_w4a16_awq_gemm_configs( + N: int, K: int, GROUP_SIZE: int +) -> Optional[Dict[int, Any]]: + """ + Return optimized configurations for the W4A16 AWQ GEMM kernel. + + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the W4A16 AWQ GEMM kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. Returns None (after logging a warning) when the config file does not exist. + """ + config_file_path = get_w4a16_awq_gemm_config_filepath(N, K, GROUP_SIZE) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default + # configuration + logger.warning( + f"\nUsing default W4A16 AWQ GEMM kernel config. Performance might " + f"be sub-optimal! 
Config file not found at {config_file_path}") + return None + +# The inference function +# input - [m, k] +# qweight - [n, k // 2] +# qzeros - [k//g, n//2] +# scales - [k//g, n] +def gemm_a16w4(input: torch.tensor, + qweight: torch.tensor, + scales: torch.tensor, + qzeros: torch.tensor, + use_fused_kernel: int = 0, + configs: Optional[Dict] = None) -> torch.tensor: + # not_used_placeholder: int = 0) -> torch.tensor: + + M, K = input.shape + N = qweight.shape[0] # (N, K//2) + group_size = K // qzeros.shape[0] + + default_config = { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "SCHEDULER": 0, + "SPLITK": 1, + "D_SHAPE": (M, N), + "D_DTYPE": 16, + "DP_TILES": 0, + "DANGLING_TILES": 0, + "NUM_CUS": 0, + "NUM_CUS_STREAMK": 0, + "NUM_GROUPS":(32 + group_size - 1) // group_size, + "USE_REDUCE_KERNEL": False + } + if configs is None: + configs = get_w4a16_awq_gemm_configs(N, K, group_size) + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] if configs else default_config + # Make sure not getting this wrong from other configs + d_shape = list(config["D_SHAPE"]) + d_shape[-2] = M + config["D_SHAPE"] = d_shape + # if use_fused_kernel == 1 and config["SPLITK"] > 1 and config["USE_REDUCE_KERNEL"]: + # return awq_gemm_triton_fused_impl( + # input, qweight, scales, qzeros, config, config.copy(), awq_gemm_kernel_fused) + + return awq_gemm_triton_impl(input, qweight, scales, qzeros, config, config.copy(), awq_gemm_kernel) + +def awq_gemm_triton_fused_impl( + input: torch.tensor, + qweight: torch.tensor, + scales: torch.tensor, + qzeros: torch.tensor, + config: Dict, + cfg4kernel: Dict, + func) -> torch.tensor: + M, K = input.shape + N = qweight.shape[0] # (N, K//2) + assert(qweight.is_contiguous()) + group_size = qweight.shape[1] * 2 // qzeros.shape[0] + + assert N > 0 and K > 0 and M > 0 + assert qweight.shape[1] == K // 2 and qweight.shape[0] == N + assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 2 + assert 
scales.shape[0] == K // group_size and scales.shape[1] == N + assert group_size <= K + assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K + + num_cus = config["NUM_CUS"] + d_shape = config["D_SHAPE"] + d_dtype = config["D_DTYPE"] + d_dtype = torch.float16 if d_dtype == 16 else torch.float32 + + def grid(META): + # tiles_M = (M + META["BLOCK_SIZE_M"] - 1) // META["BLOCK_SIZE_M"] + # tiles_N = (N + META["BLOCK_SIZE_N"] - 1) // META["BLOCK_SIZE_N"] + tiles_M = triton.cdiv(M, META["BLOCK_SIZE_M"]) + tiles_N = triton.cdiv(N, META["BLOCK_SIZE_N"]) + total_tiles = tiles_M * tiles_N + if META["SCHEDULER"] == 0: + # dp or splitk + # add extra total_tiles for reduction + return (total_tiles * META["SPLITK"] + total_tiles,) + else: + # TODO: not supported yet + # streamk + return (META["DP_TILES"] + config["NUM_CUS_STREAMK"],) + + result = torch.zeros(d_shape, dtype=d_dtype, device=input.device) + + cfg4kernel.pop("D_SHAPE", None) + cfg4kernel.pop("D_DTYPE", None) + cfg4kernel.pop("NUM_CUS_STREAMK", None) + + fn = func[grid] + + if int(os.getenv("TRITON_COMPILE_ONLY", 0)) == 1: + fn = partial(func.warmup, grid=grid) + + total_tiles_splitk = \ + triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(N, config["BLOCK_SIZE_N"]) * config["SPLITK"] + final_result = torch.zeros((M, N), dtype=torch.float16, device=input.device) + barrier = torch.zeros((total_tiles_splitk, ), dtype=torch.float16, device=input.device) + + fn(input, + qweight, + result, + qzeros, + scales, + final_result, # new added + barrier, # new added + M, + N, + N//2, + K, + K//2, + group_size, + **cfg4kernel) + + if int(os.getenv("TRITON_COMPILE_ONLY", 0)) == 1: + return + + return final_result + +def awq_gemm_triton_impl(input: torch.tensor, + qweight: torch.tensor, + scales: torch.tensor, + qzeros: torch.tensor, + config: Dict, + cfg4kernel: Dict, + func) -> torch.tensor: + M, K = input.shape + N = qweight.shape[0] # (N, K//2) + assert(qweight.is_contiguous()) + group_size = 
qweight.shape[1] * 2 // qzeros.shape[0] + + assert N > 0 and K > 0 and M > 0 + assert qweight.shape[1] == K // 2 and qweight.shape[0] == N + assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 2 + assert scales.shape[0] == K // group_size and scales.shape[1] == N + assert group_size <= K + assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K + + num_cus = config["NUM_CUS"] + d_shape = config["D_SHAPE"] + d_dtype = config["D_DTYPE"] + d_dtype = torch.float16 if d_dtype == 16 else torch.float32 + + #curr_num_cus = torch.cuda.get_device_properties("cuda").multi_processor_count + #if num_cus > 0 and num_cus != curr_num_cus: + # print("AWQ_GEMM config tuned based on num_cus={num_cus}, but now running on num_cus={curr_num_cus}, may lead to bad performance!") + + def grid(META): + tiles_M = (M + META["BLOCK_SIZE_M"] - 1) // META["BLOCK_SIZE_M"] + tiles_N = (N + META["BLOCK_SIZE_N"] - 1) // META["BLOCK_SIZE_N"] + total_tiles = tiles_M * tiles_N + if META["SCHEDULER"] == 0: + # dp or splitk + return (total_tiles * META["SPLITK"],) + else: + # streamk + return (META["DP_TILES"] + config["NUM_CUS_STREAMK"],) + + if cfg4kernel["USE_REDUCE_KERNEL"]: + result = torch.zeros(d_shape, dtype=d_dtype, device=input.device) + elif cfg4kernel["DP_TILES"] > 0 or cfg4kernel["SPLITK"] > 1: + result = torch.zeros(d_shape, dtype=d_dtype, device=input.device) + else: + result = torch.empty(d_shape, dtype=d_dtype, device=input.device) + + cfg4kernel.pop("D_SHAPE", None) + cfg4kernel.pop("D_DTYPE", None) + cfg4kernel.pop("NUM_CUS_STREAMK", None) + + fn = func[grid] + + if int(os.getenv("TRITON_COMPILE_ONLY", 0)) == 1: + fn = partial(func.warmup, grid=grid) + + fn(input, + qweight, + result, + qzeros, + scales, + M, + N, + N//2, + K, + K//2, + group_size, + **cfg4kernel) + + if int(os.getenv("TRITON_COMPILE_ONLY", 0)) == 1: + return + + if result.ndim == 3: + batch_size = result.shape[0] + final_result = torch.empty((M, N), dtype=torch.float16, 
device=input.device) + awq_reduce_and_convert_triton(result, final_result, M, N, batch_size) + return final_result + else: + result = result.to(torch.float16) + return result + +# The tuning functions below +def prune_configs(configs, nargs, **kwargs): + + def _ceil_div(x, y): + return (x + y - 1) // y + + def _prune(config): + _config = config.all_kwargs() + all_kwargs = {**_config, **kwargs, **nargs} + + BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K = all_kwargs["BLOCK_SIZE_M"], all_kwargs["BLOCK_SIZE_N"], all_kwargs["BLOCK_SIZE_K"] + num_stages = all_kwargs["num_stages"] + + if num_stages > 1 and (BLOCK_SIZE_M * BLOCK_SIZE_K + BLOCK_SIZE_K * BLOCK_SIZE_N) * 2 > 16384: + return True + + remained = [c for c in configs if not _prune(c)] + return remained + +def update_config(M, K, N, G, cfg): + config = cfg.copy() + # Derive the remaining parameters from the base config: some are needed for the launch, others avoid recomputing them inside the kernel + config["NUM_GROUPS"] = (config["BLOCK_SIZE_K"] + G - 1) // G + if config["SCHEDULER"] == 0 and config["SPLITK"] == 1: + # dp + config["DP_TILES"] = 0 + config["DANGLING_TILES"] = 0 + config["D_SHAPE"] = (M, N) + config["D_DTYPE"] = 16 + elif config["SCHEDULER"] == 0 and config["SPLITK"] > 1: + # splitk + config["DP_TILES"] = 0 + config["DANGLING_TILES"] = 0 + config["D_DTYPE"] = 32 + config["D_SHAPE"] = (config["SPLITK"], M, N) if config["USE_REDUCE_KERNEL"] else (M, N) + else: + # streamk + tiles_M = (M + config["BLOCK_SIZE_M"] - 1) // config["BLOCK_SIZE_M"] + tiles_N = (N + config["BLOCK_SIZE_N"] - 1) // config["BLOCK_SIZE_N"] + total_tiles = tiles_M * tiles_N + dangling_tiles = max(0, total_tiles - config["NUM_CUS"]) % config["NUM_CUS"] + dp_tiles = total_tiles - dangling_tiles + if dangling_tiles == 0: + # redirect to dp + config["SCHEDULER"] = 0 + config["SPLITK"] = 1 + config["USE_REDUCE_KERNEL"] = 0 + config["DP_TILES"] = 0 + config["DANGLING_TILES"] = 0 + config["D_SHAPE"] = (M, N) + config["D_DTYPE"] = 16 + else: + # still streamk + config["DP_TILES"] = dp_tiles + config["DANGLING_TILES"] = 
dangling_tiles + iters_per_tile = (K + config["BLOCK_SIZE_K"] - 1) // config["BLOCK_SIZE_K"] + dangling_iters = iters_per_tile * config["DANGLING_TILES"] + dangling_iters_per_cu = (dangling_iters + config["NUM_CUS"] - 1) // config["NUM_CUS"] + num_cus_streamk = (dangling_iters + dangling_iters_per_cu - 1) // dangling_iters_per_cu + config["NUM_CUS_STREAMK"] = num_cus_streamk + num_cus_per_dangling_tile = (iters_per_tile + dangling_iters_per_cu - 1) // dangling_iters_per_cu + 1 + config["D_DTYPE"] = 32 + config["D_SHAPE"] = (num_cus_per_dangling_tile, M, N) if config["USE_REDUCE_KERNEL"] else (M, N) + + return config + +''' +@triton.autotune( + configs=[ + triton.Config({ + "BLOCK_SIZE_M": M, + "BLOCK_SIZE_N": N, + }, num_warps=num_warps, num_stages=num_stages) + for M in [128, 64, 32, 16] for N in [512, 128, 64, 32, 16]\ + for num_warps in [1, 2, 4, 8, 16] for num_stages in [1, 2] + ], + key=["M", "N", "batch_size"], + perf_debug=True +) +''' + +@triton.utils.jit(key=["M", "N", "batch_size", "input_ptr"]) +def awq_reduce_and_convert_kernel( + input_ptr, + output_ptr, + M, + N, + batch_size, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_n = tl.program_id(1) + + tl.assume(M >= 0) + tl.assume(N >= 0) + tl.assume(batch_size >= 0) + tl.assume(BLOCK_SIZE_M >= 0) + tl.assume(BLOCK_SIZE_N >= 0) + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + mask_m = offs_m < M + mask_n = offs_n < N + mask = mask_m[:, None] & mask_n[None, :] + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for batch_idx in range(batch_size): + batch_offset = batch_idx * M * N + input_offsets = batch_offset + offs_m[:, None] * N + offs_n[None, :] + input_data = tl.load(input_ptr + input_offsets, mask=mask, other=0.0) + acc += input_data + + output_offsets = offs_m[:, None] * N + offs_n[None, :] + acc_f16 = 
acc.to(tl.float16) + tl.store(output_ptr + output_offsets, acc_f16, mask=mask) + +def awq_reduce_and_convert_triton( + input_tensor: torch.Tensor, + output_tensor: torch.Tensor, + M: int, + N: int, + batch_size: int = 1 +) -> None: + + grid = lambda META: ( + triton.cdiv(M, META['BLOCK_SIZE_M']), + triton.cdiv(N, META['BLOCK_SIZE_N']), + ) + + BLOCK_SIZE_M = 32 + BLOCK_SIZE_N = 128 + num_warps = 16 + + awq_reduce_and_convert_kernel[grid]( + input_tensor, + output_tensor, + M, + N, + batch_size, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + num_warps=num_warps + ) diff --git a/aiter/ops/triton/gemm_a8w8.py b/aiter/ops/triton/gemm_a8w8.py new file mode 100644 index 0000000000000000000000000000000000000000..229c299cb785e0427afbeb076485973e08199ccc --- /dev/null +++ b/aiter/ops/triton/gemm_a8w8.py @@ -0,0 +1,258 @@ +# SPDX-License-Identifier: MIT + +from typing import Optional +import functools +import json +import torch +import triton +import triton.language as tl +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH + + +@triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0, + "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) + * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), + } +) +@triton.jit +def _gemm_a8w8_kernel( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + a_scale_ptr, + b_scale_ptr, + bias_ptr, + # Matrix dimensions + M, + N, + K, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
+ stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + # Meta-parameters + HAS_BIAS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + EVEN_K: tl.constexpr, + GRID_MN: tl.constexpr, +): + """ + Note: this is Triton jited function and not meant to be called directly. Call gemm_a8w8 function + below + + Computes the 8 bit matmul C = A x B, applies a conversion scale and optionally adds a bias to + the result. + The conversion scale is received in the form of two 1D tensors that are multiplied to form a + 2D one before being applied. + + Key parameters: + - A: Matrix A with shape (M, K). + - B: Matrix B with shape (K, N). + - C: Matrix C with shape (M, N). + - A_scale: First scale tensor with shape (M, 1). + - B_scale: Second scale tensor with shape (1, N). + - Bias: Bias tensor with shape (1, N). + """ + + NUM_XCDS: tl.constexpr = 8 + + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + ## pid remapping on xcds + # Number of pids per XCD in the new arrangement + pids_per_xcd = (GRID_MN + NUM_XCDS - 1) // NUM_XCDS + # When GRID_MN cannot divide NUM_XCDS, some xcds will have + # pids_per_xcd pids, the other will have pids_per_xcd - 1 pids. 
+ # We calculate the number of xcds that have pids_per_xcd pids as + # tall_xcds + tall_xcds = GRID_MN % NUM_XCDS + tall_xcds = NUM_XCDS if tall_xcds == 0 else tall_xcds + # Compute current XCD and local pid within the XCD + xcd = pid % NUM_XCDS + local_pid = pid // NUM_XCDS + # Calculate new pid based on the new grouping + # Note that we need to consider the following two cases: + # 1. the current pid is on a tall xcd + # 2. the current pid is on a short xcd + if xcd < tall_xcds: + pid = xcd * pids_per_xcd + local_pid + else: + pid = ( + tall_xcds * pids_per_xcd + + (xcd - tall_xcds) * (pids_per_xcd - 1) + + local_pid + ) + + if GROUP_SIZE_M == 1: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + + # Create pointers for first block of A and B input matrices + offs_k = tl.arange(0, BLOCK_SIZE_K) + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + # Create pointers for the scale tensors and load them + offs_a_scale = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) % M + offs_b_scale = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) % N + a_scale = tl.load(a_scale_ptr + offs_a_scale) + b_scale = tl.load(b_scale_ptr + offs_b_scale) + + acc_dtype = tl.float32 if c_ptr.type.element_ty != tl.int8 else tl.int32 + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by 
checking the K dimension. + # If it is out of bounds, set it to 0. + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs) + else: + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + + accumulator += tl.dot(a, b, input_precision="ieee") + + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + # Apply scale + accumulator *= a_scale[:, None] * b_scale[None, :] + + # Add bias + if HAS_BIAS: + offs_bias = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) % N + bias = tl.load(bias_ptr + offs_bias) + accumulator = accumulator.to(bias_ptr.type.element_ty) + bias[None, :] + + c = accumulator.to(c_ptr.type.element_ty) + + # Write back the block of the output matrix C with masks. + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + +@functools.lru_cache(maxsize=1024) +def _get_config( + M: int, + N: int, + K: int, +): + if not hasattr(_get_config, "_config_dict"): + dev = arch_info.get_device() + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A8W8.json" + with open(fpath, "r") as file: + config = json.load(file) + _get_config._config_dict = config + + return _get_config._config_dict["any"] + + +def gemm_a8w8( + x: torch.Tensor, + w: torch.Tensor, + x_scale: torch.Tensor, + w_scale: torch.Tensor, + bias: Optional[torch.Tensor] = None, + dtype: Optional[float] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, +): + """ + Computes the 8 bit matmul Y = X x WT, applies a conversion scale and optionally adds a bias + to the result. 
+ The conversion scale is received in the form of two 1D tensors that are multiplied to form a + 2D one before being applied. + + Key parameters: + - X: Matrix X with shape (M, K). + - W: Matrix W with shape (N, K). + - X_scale: First scale tensor with shape (M, 1). + - W_scale: Second scale tensor with shape (1, N). + - Bias: Bias tensor with shape (1, N). + - Y: Output Matrix Y with shape (M, K). If this is none, then it's created by this API and returned as output + + Returns: + - Y: The output matrix with shape (M, N). + """ + + # Check constraints. + assert x.shape[1] == w.shape[1], "Incompatible dimensions!!!" + + M, K = x.shape + N, K = w.shape + + # Transpose w (kernel expects (K, N)) + w = w.T + + if y is None: + y = torch.empty((M, N), dtype=dtype, device=x.device) + + if config is None: + config = _get_config(M, N, K) + + grid = ( + triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(N, config["BLOCK_SIZE_N"]), + ) + _gemm_a8w8_kernel[grid]( + x, + w, + y, + x_scale, + w_scale, + bias, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + y.stride(0), + y.stride(1), + bias is not None, + **config, + ) + + return y diff --git a/aiter/ops/triton/gemm_a8w8_blockscale.py b/aiter/ops/triton/gemm_a8w8_blockscale.py new file mode 100644 index 0000000000000000000000000000000000000000..d16918111c4e53873843935f9bfced030b38c1d2 --- /dev/null +++ b/aiter/ops/triton/gemm_a8w8_blockscale.py @@ -0,0 +1,278 @@ +# SPDX-License-Identifier: MIT + +from typing import Optional +import functools +import json +import torch +import triton +import triton.language as tl +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH + + +@triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0, + "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) + * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), + } +) +@triton.jit +def _gemm_a8w8_blockscale_kernel( + # 
Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + a_scale_ptr, + b_scale_ptr, + # Matrix dimensions + M, + N, + K, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_ascale_m, + stride_ascale_k, + stride_bscale_k, + stride_bscale_n, + # Meta-parameters + GROUP_K: tl.constexpr, + GROUP_N: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + EVEN_K: tl.constexpr, + GRID_MN: tl.constexpr, +): + """ + Note: this is Triton jited function and not meant to be called directly. Call gemm_a8w8_blockscale function + below + + Computes the 8 bit matmul C = A x B using the block-scale quantization approach. + + Key parameters: + - A: Matrix A with shape (M, K). + - B: Matrix B with shape (K, N). + - C: Matrix C with shape (M, N). + - A_scale: Scale tensor for A with shape (M, *scale_k). + - B_scale: Scale tensor for B with shape (*scale_k, **scale_n). + + *scale_k = (K + GROUP_K - 1) // GROUP_K + **scale_n = (N + GROUP_N - 1) // GROUP_N + """ + + NUM_XCDS: tl.constexpr = 8 + + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_ascale_m > 0) + tl.assume(stride_ascale_k > 0) + tl.assume(stride_bscale_k > 0) + tl.assume(stride_bscale_n > 0) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. 
+ pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + ## pid remapping on xcds + # Number of pids per XCD in the new arrangement + pids_per_xcd = (GRID_MN + NUM_XCDS - 1) // NUM_XCDS + # When GRID_MN cannot divide NUM_XCDS, some xcds will have + # pids_per_xcd pids, the other will have pids_per_xcd - 1 pids. + # We calculate the number of xcds that have pids_per_xcd pids as + # tall_xcds + tall_xcds = GRID_MN % NUM_XCDS + tall_xcds = NUM_XCDS if tall_xcds == 0 else tall_xcds + # Compute current XCD and local pid within the XCD + xcd = pid % NUM_XCDS + local_pid = pid // NUM_XCDS + # Calculate new pid based on the new grouping + # Note that we need to consider the following two cases: + # 1. the current pid is on a tall xcd + # 2. the current pid is on a short xcd + if xcd < tall_xcds: + pid = xcd * pids_per_xcd + local_pid + else: + pid = ( + tall_xcds * pids_per_xcd + + (xcd - tall_xcds) * (pids_per_xcd - 1) + + local_pid + ) + + if GROUP_SIZE_M == 1: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + + # Create pointers for first block of A and B input matrices + offs_k = tl.arange(0, BLOCK_SIZE_K) + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + # Create pointers for the scales + k_start = 0 + offs_ks = k_start // GROUP_K + a_scale_ptrs = a_scale_ptr + offs_am * stride_ascale_m + offs_ks * stride_ascale_k + offs_bsn = 
offs_bn // GROUP_N + b_scale_ptrs = b_scale_ptr + offs_ks * stride_bscale_k + offs_bsn * stride_bscale_n + + acc_dtype = tl.float32 if c_ptr.type.element_ty != tl.int8 else tl.int32 + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs) + else: + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + + a_scale = tl.load(a_scale_ptrs) + b_scale = tl.load(b_scale_ptrs) + + # Perform dot operation and apply scale + accumulator += ( + tl.dot(a, b, input_precision="ieee") * a_scale[:, None] * b_scale[None, :] + ) + + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + k_cur = k * BLOCK_SIZE_K // GROUP_K + k_nxt = (k + 1) * BLOCK_SIZE_K // GROUP_K + offs_ks = k_nxt - k_cur + a_scale_ptrs += offs_ks * stride_ascale_k + b_scale_ptrs += offs_ks * stride_bscale_k + + c = accumulator.to(c_ptr.type.element_ty) + + # Write back the block of the output matrix C with masks. 
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + +@functools.lru_cache(maxsize=1024) +def _get_config( + M: int, + N: int, + K: int, +): + if not hasattr(_get_config, "_config_dict"): + dev = arch_info.get_device() + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM_BLOCKSCALE-A8W8.json" + print(f"fpath={fpath}") + with open(fpath, "r") as file: + config = json.load(file) + _get_config._config_dict = config + + return _get_config._config_dict["any"] + + +def gemm_a8w8_blockscale( + x: torch.Tensor, + w: torch.Tensor, + x_scale: torch.Tensor, + w_scale: torch.Tensor, + dtype: Optional[float] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, +): + """ + Computes the 8 bit matmul Y = X x WT using the block-scale quantization approach. + + Key parameters: + - X: Matrix X with shape (M, K). + - W: Matrix W with shape (N, K). + - X_scale: Scale tensor for X with shape (M, *scale_k). + - W_scale: Scale tensor for W with shape (**scale_n, *scale_k). + - Y: Output Matrix Y with shape (M, K). If this is none, then it's created by this API and returned as output + + Returns: + - Y: The output matrix with shape (M, N). + + *scale_k = (K + scale_block_size_k - 1) // scale_block_size_k + **scale_n = (N + scale_block_size_n - 1) // scale_block_size_n + """ + M, K = x.shape + N, K = w.shape + + # Check constraints. + assert x.shape[1] == w.shape[1], "Incompatible dimensions!!!" 
+ + # Transpose w and w_scale + w = w.T + w_scale = w_scale.T + + if y is None: + y = torch.empty((M, N), dtype=dtype, device=x.device) + + if config is None: + config = _get_config(M, N, K) + + # Scale block sizes + # TODO: need a better way to pass scale block sizes around + config["GROUP_K"] = triton.next_power_of_2(triton.cdiv(K, w_scale.shape[0])) + config["GROUP_N"] = triton.next_power_of_2(triton.cdiv(N, w_scale.shape[1])) + + grid = lambda META: ( # noqa: E731 + ( + triton.cdiv(M, config["BLOCK_SIZE_M"]) + * triton.cdiv(N, config["BLOCK_SIZE_N"]), + ) + ) + _gemm_a8w8_blockscale_kernel[grid]( + x, + w, + y, + x_scale, + w_scale, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + y.stride(0), + y.stride(1), + x_scale.stride(0), + x_scale.stride(1), + w_scale.stride(0), + w_scale.stride(1), + **config, + ) + + return y diff --git a/aiter/ops/triton/gemm_a8wfp4.py b/aiter/ops/triton/gemm_a8wfp4.py new file mode 100644 index 0000000000000000000000000000000000000000..b6906e7f4d7bef9d0d54d87c9732a6db6e9ef144 --- /dev/null +++ b/aiter/ops/triton/gemm_a8wfp4.py @@ -0,0 +1,447 @@ +# SPDX-License-Identifier: MIT + +from typing import Optional +import functools +import json +import torch +import triton +import triton.language as tl +from aiter.ops.triton.utils.pid_preprocessing import pid_grid, remap_xcd +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH + +global _USE_GEMM_SPLITK_BF16 +_USE_GEMM_SPLITK_BF16 = False + + +def set_use_gemm_splitk_bf16(value: bool): + global _USE_GEMM_SPLITK_BF16 + _USE_GEMM_SPLITK_BF16 = value + + +@triton.heuristics( + { + "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) + and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) + and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), + "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) + * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), + } +) 
+@triton.jit +def _gemm_a8wfp4_kernel( + a_ptr, + b_ptr, + c_ptr, + a_scales_ptr, + b_scales_ptr, + M, + N, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_ck, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bsn, + stride_bsk, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + NUM_KSPLIT: tl.constexpr, + SPLITK_BLOCK_SIZE: tl.constexpr, + EVEN_K: tl.constexpr, + GRID_MN: tl.constexpr, + RAW_MASKED_LOADS: tl.constexpr, + cache_modifier: tl.constexpr, +): + """Kernel for computing the matmul C = A x B. + A is in fp8 e4m3 format. + B is in the microscale fp4 (mxfp4) format. + A_scales and B_scales are in e8m0 format. + A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_asm > 0) + tl.assume(stride_ask > 0) + tl.assume(stride_bsk > 0) + tl.assume(stride_bsn > 0) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid_unified = tl.program_id(axis=0) + pid_k = pid_unified % NUM_KSPLIT + pid = pid_unified // NUM_KSPLIT + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + if NUM_KSPLIT == 1: + remap_xcd(pid, GRID_MN) + + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) + else: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + # We assume 32 elements along K share the same scale. 
+ SCALE_GROUP_SIZE: tl.constexpr = 32 + + if (pid_k * SPLITK_BLOCK_SIZE) < K: + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE, BLOCK_SIZE_K) + + # Set up base A offsets + offs_am_raw = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_am = offs_am_raw % M + + # Load A scales once (they're per-row) + a_scale_ptrs = a_scales_ptr + offs_am * stride_asm + if RAW_MASKED_LOADS: + a_scale_mask = offs_am < M + a_scales = tl.load(a_scale_ptrs, mask=a_scale_mask) + else: + a_scales = tl.load(a_scale_ptrs) + a_ones_scale = tl.full( + (BLOCK_SIZE_M, BLOCK_SIZE_K // SCALE_GROUP_SIZE), 127, dtype=tl.uint8 + ) # 1.0 in e8m0 + + # Set up base B offsets + offs_bn_raw = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_bn = offs_bn_raw % N + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, num_k_iter): + # Load A inside the loop with correct K offset + offs_ak = tl.arange(0, BLOCK_SIZE_K) + k * BLOCK_SIZE_K # Add k offset + offs_ak_split = pid_k * SPLITK_BLOCK_SIZE + offs_ak + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_ak_split[None, :] * stride_ak + ) + if RAW_MASKED_LOADS: + a_mask = (offs_am_raw[:, None] < M) & (offs_ak_split[None, :] < K) + a = tl.load(a_ptrs, mask=a_mask) + else: + a = tl.load(a_ptrs) + + # B loading stays mostly the same, but fix the offsets + offs_bk = tl.arange(0, BLOCK_SIZE_K // 2) + k * ( + BLOCK_SIZE_K // 2 + ) # Add k offset + offs_bk_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_bk + b_ptrs = b_ptr + ( + offs_bk_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn + ) + offs_ks = ( + (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)) + + k * (BLOCK_SIZE_K // SCALE_GROUP_SIZE) + + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE) + ) + b_scale_ptrs = ( + b_scales_ptr + + offs_bn[:, None] * stride_bsn + + offs_ks[None, :] * stride_bsk + ) + if RAW_MASKED_LOADS: + b_k_mask = offs_bk_split[:, None] < (K // 2) + b_n_mask = offs_bn_raw[None, :] < N + b_mask = b_k_mask & b_n_mask + if EVEN_K: 
+ b = tl.load(b_ptrs, mask=b_mask, cache_modifier=cache_modifier) + else: + b = tl.load(b_ptrs, mask=b_mask, other=0) + bs_k_mask = offs_ks[None, :] < (K // SCALE_GROUP_SIZE) + bs_n_scale_mask = offs_bn_raw[:, None] < N + bs_mask = bs_k_mask & bs_n_scale_mask + b_scales = tl.load(b_scale_ptrs, mask=bs_mask, other=0) + else: + if EVEN_K: + b = tl.load(b_ptrs, cache_modifier=cache_modifier) + else: + b_mask = offs_bk[:, None] < K - k * (BLOCK_SIZE_K // 2) + b = tl.load(b_ptrs, mask=b_mask, other=0) + b_scales = tl.load(b_scale_ptrs) + accumulator += tl.dot_scaled(a, a_ones_scale, "e4m3", b, b_scales, "e2m1") + + # Scale by a_scales at the end + c = (accumulator * a_scales[:, None]).to(c_ptr.type.element_ty) + + # Write back the block of the output matrix C with masks. + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + c_ptrs = ( + c_ptr + + stride_cm * offs_cm[:, None] + + stride_cn * offs_cn[None, :] + + pid_k * stride_ck + ) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + +@triton.jit +def _gemm_afp4_wfp4_reduce_kernel( + c_in_ptr, + c_out_ptr, + M, + N, + stride_c_in_k, + stride_c_in_m, + stride_c_in_n, + stride_c_out_m, + stride_c_out_n, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + ACTUAL_KSPLIT: tl.constexpr, + MAX_KSPLIT: tl.constexpr, +): + + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + + offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, MAX_KSPLIT) + c_in_ptrs = ( + c_in_ptr + + (offs_k[:, None, None] * stride_c_in_k) + + (offs_m[None, :, None] * stride_c_in_m) + + (offs_n[None, None, :] * stride_c_in_n) + ) + + if ACTUAL_KSPLIT == MAX_KSPLIT: + c = tl.load(c_in_ptrs) + else: + c = tl.load(c_in_ptrs, mask=offs_k[:, None, None] < ACTUAL_KSPLIT) + c = tl.sum(c, axis=0) + + 
c = c.to(c_out_ptr.type.element_ty) + + c_out_ptrs = ( + c_out_ptr + + (offs_m[:, None] * stride_c_out_m) + + (offs_n[None, :] * stride_c_out_n) + ) + + tl.store(c_out_ptrs, c) + + +def get_splitk(K: int, BLOCK_SIZE_K: int, NUM_KSPLIT: int): + # heuristics for make "EVEN_K == True" as much as possible + NUM_KSPLIT_STEP = 4 + BLOCK_SIZE_K_STEP = 4 + SPLITK_BLOCK_SIZE = ( + triton.cdiv((2 * triton.cdiv(K, NUM_KSPLIT)), BLOCK_SIZE_K) * BLOCK_SIZE_K + ) + while NUM_KSPLIT > 1 and BLOCK_SIZE_K > 16: + if ( + K % (SPLITK_BLOCK_SIZE // 2) == 0 + and SPLITK_BLOCK_SIZE % BLOCK_SIZE_K == 0 + and K % (BLOCK_SIZE_K // 2) == 0 + ): + break + elif K % (SPLITK_BLOCK_SIZE // 2) != 0 and NUM_KSPLIT > 1: + NUM_KSPLIT = NUM_KSPLIT // NUM_KSPLIT_STEP + elif SPLITK_BLOCK_SIZE % BLOCK_SIZE_K != 0: + if NUM_KSPLIT > 1: + NUM_KSPLIT = NUM_KSPLIT // NUM_KSPLIT_STEP + elif BLOCK_SIZE_K > 16: + BLOCK_SIZE_K = BLOCK_SIZE_K // BLOCK_SIZE_K_STEP + elif K % (BLOCK_SIZE_K // 2) != 0 and BLOCK_SIZE_K > 16: + BLOCK_SIZE_K = BLOCK_SIZE_K // BLOCK_SIZE_K_STEP + else: + break + + SPLITK_BLOCK_SIZE = ( + triton.cdiv((2 * triton.cdiv(K, NUM_KSPLIT)), BLOCK_SIZE_K) * BLOCK_SIZE_K + ) + + return SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT + + +@functools.lru_cache(maxsize=1024) +def _get_config( + M: int, + N: int, + K: int, +): + if not hasattr(_get_config, "_config_dict"): + dev = arch_info.get_device() + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A8WFP4.json" + + with open(fpath, "r") as file: + config = json.load(file) + _get_config._config_dict = config + + if M < 32: + config = _get_config._config_dict["M_LT_32"] + elif M == 32: + config = _get_config._config_dict["M_EQ_32"] + elif M <= 64: + config = _get_config._config_dict["M_33_64"] + elif M <= 128: + config = _get_config._config_dict["M_65_128"] + elif M <= 256: + config = _get_config._config_dict["M_129_256"] + else: + config = _get_config._config_dict["default"] + + if M <= 128: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = 
get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + + return config + + +def gemm_a8wfp4( + x, + w, + y, + x_scales, + w_scales, + dtype: Optional[float] = torch.bfloat16, + config: Optional[dict] = None, +): + """ + Computes the matmul Y = X @ W.T (where W.T is the logical transpose of unpacked W) + + X is in fp8 e4m3 format. + W is in packed microscale fp4 (mxfp4) format, where 2 fp4 values are packed per uint8. + x_scales are in fp32 format (one scale per row of X). + w_scales are in e8m0 format (one scale per group of 32 elements in K dimension). + + Key parameters: + - x: Matrix X with shape (M, K) in fp8 e4m3 format + - w: Matrix W with shape (N, K//2) in packed fp4 format (2 values per uint8) + - y: Pre-allocated output matrix with shape (M, N) + - x_scales: Per-row scales for X with shape (M, 1) in fp32 format + - w_scales: Per-group scales for W with shape (N, K//32) in e8m0 format + - dtype: Output data type (default: torch.bfloat16) + + Returns: + - y: The output matrix with shape (M, N) containing X @ W.T + + Note: + - W is stored in packed format where each uint8 contains 2 fp4 values + - The logical shape of W after unpacking would be (N, K) + - Every 32 consecutive elements in the K dimension of W share one e8m0 scale + - X uses per-row scaling (not per-group scaling) + """ + M, K = x.shape + N, K_packed = w.shape + w = w.T + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + assert ( + K_packed == K // 2 + ), f"Inconsistent shapes: x has K={K} but w has K_packed={K_packed}, expected {K//2}" + assert x_scales.shape[0] == M and w_scales.shape == ( + N, + K // 32, + ), f"Scale shapes incorrect: x_scales should be ({M}, 1), got {x_scales.shape}; w_scales should be ({N}, {K//32}), got {w_scales.shape}" + + if config is None: + config = 
_get_config(M, N, K) + + if M <= 128: + if _USE_GEMM_SPLITK_BF16: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device + ) + else: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device + ) + else: + SPLITK_BLOCK_SIZE = 2 * K + y_pp = None + + grid = lambda META: ( # noqa: E731 + ( + config["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), + ) + + y_final = y if config["NUM_KSPLIT"] == 1 else y_pp + stride_am, stride_ak = x.stride() + stride_bk, stride_bn = w.stride() + stride_ck, stride_cm, stride_cn = ( + (0, y.stride(0), y.stride(1)) if config["NUM_KSPLIT"] == 1 else y_pp.stride() + ) + stride_asm, stride_ask = x_scales.stride() + stride_bsn, stride_bsk = w_scales.stride() + + _gemm_a8wfp4_kernel[grid]( + x, + w, + y_final, + x_scales, + w_scales, + M, + N, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_ck, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bsn, + stride_bsk, + RAW_MASKED_LOADS=True, + **config, + ) + + if config["NUM_KSPLIT"] > 1: + REDUCE_BLOCK_SIZE_M = 16 + # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails + # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and + # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials + REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 + ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"])) + + grid_reduce = ( + triton.cdiv(M, REDUCE_BLOCK_SIZE_M), + triton.cdiv(N, REDUCE_BLOCK_SIZE_N), + ) + _gemm_afp4_wfp4_reduce_kernel[grid_reduce]( + y_pp, + y, + M, + N, + y_pp.stride(0), + y_pp.stride(1), + y_pp.stride(2), + y.stride(0), + y.stride(1), + REDUCE_BLOCK_SIZE_M, + REDUCE_BLOCK_SIZE_N, + ACTUAL_KSPLIT, + config["NUM_KSPLIT"], + ) diff --git a/aiter/ops/triton/gemm_afp4wfp4.py b/aiter/ops/triton/gemm_afp4wfp4.py new file mode 100644 index 
0000000000000000000000000000000000000000..ad069f66140cdfbefafc4b7a3ee00810098b15dd --- /dev/null +++ b/aiter/ops/triton/gemm_afp4wfp4.py @@ -0,0 +1,731 @@ +# SPDX-License-Identifier: MIT + +from typing import Optional +import functools +import json +import os +import torch +import triton +import triton.language as tl +from aiter.ops.triton.utils.pid_preprocessing import ( + pid_grid, + remap_xcd, +) +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH + +global _USE_GEMM_SPLITK_BF16 +_USE_GEMM_SPLITK_BF16 = False + + +def set_use_gemm_splitk_bf16(value: bool): + global _USE_GEMM_SPLITK_BF16 + _USE_GEMM_SPLITK_BF16 = value + + +@triton.heuristics( + { + "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) + and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) + and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), + "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) + * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), + } +) +@triton.jit +def _gemm_afp4_wfp4_kernel( + a_ptr, + b_ptr, + c_ptr, + a_scales_ptr, + b_scales_ptr, + M, + N, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_ck, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bsn, + stride_bsk, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + NUM_KSPLIT: tl.constexpr, + SPLITK_BLOCK_SIZE: tl.constexpr, + EVEN_K: tl.constexpr, + GRID_MN: tl.constexpr, + cache_modifier: tl.constexpr, +): + """Kernel for computing the matmul C = A x B. + A and B inputs are in the microscale fp4 (mxfp4) format. + A_scales and B_scales are in e8m0 format. 
+ A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_asm > 0) + tl.assume(stride_ask > 0) + tl.assume(stride_bsk > 0) + tl.assume(stride_bsn > 0) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid_unified = tl.program_id(axis=0) + # remap so that XCDs get continous chunks of pids (of CHUNK_SIZE). + pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) + + pid_k = pid_unified % NUM_KSPLIT + pid = pid_unified // NUM_KSPLIT + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + if NUM_KSPLIT == 1: + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) + else: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + # We assume 32 elements along K share the same scale. + SCALE_GROUP_SIZE: tl.constexpr = 32 + + if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: + + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) + + # Create pointers for first block of A and B input matrices + # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container. 
+ offs_k = tl.arange(0, BLOCK_SIZE_K // 2) + offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak + ) + b_ptrs = b_ptr + ( + offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn + ) + # Create pointers for the first block of A and B scales + offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)) + tl.arange( + 0, BLOCK_SIZE_K // SCALE_GROUP_SIZE + ) + a_scale_ptrs = ( + a_scales_ptr + offs_am[:, None] * stride_asm + offs_ks[None, :] * stride_ask + ) + # B scales are N x K even though B operand is K x N. + b_scale_ptrs = ( + b_scales_ptr + offs_bn[:, None] * stride_bsn + offs_ks[None, :] * stride_bsk + ) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): + a_scales = tl.load(a_scale_ptrs) + b_scales = tl.load(b_scale_ptrs) + + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs, cache_modifier=cache_modifier) + else: + a = tl.load( + a_ptrs, mask=offs_k[None, :] < K - k * (BLOCK_SIZE_K // 2), other=0 + ) + b = tl.load( + b_ptrs, + mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), + other=0, + cache_modifier=cache_modifier, + ) + + accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1") + + # Advance the ptrs to the next K block. + a_ptrs += (BLOCK_SIZE_K // 2) * stride_ak + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + a_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_ask + b_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_bsk + + c = accumulator.to(c_ptr.type.element_ty) + + # Write back the block of the output matrix C with masks. 
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + c_ptrs = ( + c_ptr + + stride_cm * offs_cm[:, None] + + stride_cn * offs_cn[None, :] + + pid_k * stride_ck + ) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + +@triton.heuristics( + { + "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) + and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0) + and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), + "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"]) + * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), + } +) +@triton.jit +def _gemm_afp4_wfp4_kernel_preshuffled_scales( + a_ptr, + b_ptr, + c_ptr, + a_scales_ptr, + b_scales_ptr, + M, + N, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_ck, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bsn, + stride_bsk, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + NUM_KSPLIT: tl.constexpr, + SPLITK_BLOCK_SIZE: tl.constexpr, + EVEN_K: tl.constexpr, + GRID_MN: tl.constexpr, + cache_modifier: tl.constexpr, +): + """Kernel for computing the matmul C = A x B. + A and B inputs are in the microscale fp4 (mxfp4) format. + A_scales and B_scales are in e8m0 format. + A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_asm > 0) + tl.assume(stride_ask > 0) + tl.assume(stride_bsk > 0) + tl.assume(stride_bsn > 0) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. 
+ pid_unified = tl.program_id(axis=0) + pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) + pid_k = pid_unified % NUM_KSPLIT + pid = pid_unified // NUM_KSPLIT + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + if NUM_KSPLIT == 1: + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) + else: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + # We assume 32 elements along K share the same scale. + SCALE_GROUP_SIZE: tl.constexpr = 32 + + if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: + + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) + + # Create pointers for first block of A and B input matrices + # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container. + offs_k = tl.arange(0, BLOCK_SIZE_K // 2) + offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak + ) + b_ptrs = b_ptr + ( + offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn + ) + # Create pointers for the first block of A and B scales + + offs_asn = ( + pid_n * (BLOCK_SIZE_N // 32) + tl.arange(0, (BLOCK_SIZE_N // 32)) + ) % N + offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) * 32) + tl.arange( + 0, BLOCK_SIZE_K // SCALE_GROUP_SIZE * 32 + ) + # B scales are N x K even though B operand is K x N. 
+ b_scale_ptrs = ( + b_scales_ptr + + offs_asn[:, None] * stride_bsn + + offs_ks[None, :] * stride_bsk + ) + + if BLOCK_SIZE_M < 32: + offs_ks_non_shufl = ( + pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) + ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE) + a_scale_ptrs = ( + a_scales_ptr + + offs_am[:, None] * stride_asm + + offs_ks_non_shufl[None, :] * stride_ask + ) + else: + offs_asm = ( + pid_m * (BLOCK_SIZE_M // 32) + tl.arange(0, (BLOCK_SIZE_M // 32)) + ) % M + a_scale_ptrs = ( + a_scales_ptr + + offs_asm[:, None] * stride_asm + + offs_ks[None, :] * stride_ask + ) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): + a_scales = tl.load(a_scale_ptrs) + b_scales = tl.load(b_scale_ptrs, cache_modifier=cache_modifier) + if BLOCK_SIZE_M >= 32: + a_scales = tl.reshape( + a_scales, (BLOCK_SIZE_M, BLOCK_SIZE_K // SCALE_GROUP_SIZE) + ) + b_scales = tl.reshape( + b_scales, (BLOCK_SIZE_N, BLOCK_SIZE_K // SCALE_GROUP_SIZE) + ) + + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs, cache_modifier=cache_modifier) + else: + a = tl.load( + a_ptrs, mask=offs_k[None, :] < K - k * (BLOCK_SIZE_K // 2), other=0 + ) + b = tl.load( + b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0 + ) + + accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1") + + # Advance the ptrs to the next K block. + a_ptrs += (BLOCK_SIZE_K // 2) * stride_ak + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + if BLOCK_SIZE_M < 32: + a_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_ask + else: + a_scale_ptrs += BLOCK_SIZE_K * stride_ask + b_scale_ptrs += BLOCK_SIZE_K * stride_bsk + + c = accumulator.to(c_ptr.type.element_ty) + + # Write back the block of the output matrix C with masks. 
@triton.jit
def _gemm_afp4_wfp4_reduce_kernel(
    c_in_ptr,
    c_out_ptr,
    M,
    N,
    stride_c_in_k,
    stride_c_in_m,
    stride_c_in_n,
    stride_c_out_m,
    stride_c_out_n,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    ACTUAL_KSPLIT: tl.constexpr,
    MAX_KSPLIT: tl.constexpr,
):
    """Reduce split-K partial products into the final output tile.

    Each program handles one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile: it loads the
    tile from every split-K slice of ``c_in`` (indexed by ``stride_c_in_k``),
    sums over the slice axis and stores the result, cast to the element type
    of ``c_out``.

    ``MAX_KSPLIT`` is the number of slices addressed by the load (it is the
    extent of a ``tl.arange``), ``ACTUAL_KSPLIT`` is how many of them are
    included in the sum.
    """

    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Out-of-range rows/columns are wrapped back into the matrix with a modulo
    # instead of being masked, so the loads and the final store need no M/N
    # mask; wrapped lanes just recompute an in-range element.
    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, MAX_KSPLIT)
    c_in_ptrs = (
        c_in_ptr
        + (offs_k[:, None, None] * stride_c_in_k)
        + (offs_m[None, :, None] * stride_c_in_m)
        + (offs_n[None, None, :] * stride_c_in_n)
    )

    if ACTUAL_KSPLIT == MAX_KSPLIT:
        c = tl.load(c_in_ptrs)
    else:
        # Masked-out slices must contribute exactly zero to the sum below.
        # Without `other`, the values returned for masked lanes are undefined
        # and would be accumulated into the result.
        c = tl.load(
            c_in_ptrs,
            mask=offs_k[:, None, None] < ACTUAL_KSPLIT,
            other=0.0,
        )
    c = tl.sum(c, axis=0)

    c = c.to(c_out_ptr.type.element_ty)

    c_out_ptrs = (
        c_out_ptr
        + (offs_m[:, None] * stride_c_out_m)
        + (offs_n[None, :] * stride_c_out_n)
    )

    tl.store(c_out_ptrs, c)
def gemm_afp4wfp4(
    x,
    w,
    x_scales,
    w_scales,
    dtype: Optional[torch.dtype] = torch.bfloat16,
    y: Optional[torch.Tensor] = None,
    config: Optional[dict] = None,
):
    """
    Computes the matmul Y = X x W
    X and W are e2m1 fp4 tensors.
    x_scales and w_scales are e8m0 tensors.
    Every 32 elements in the K dimension share one e8m0 scale.


    Key parameters:
    - X: Matrix X with shape (M, K).
    - W: Matrix W with shape (N, K).
    - X_scales: Matrix with shape (M, K // 32)
    - W_scales: Matrix with shape (N, K // 32)
    - dtype: Element type of the output when `y` is not supplied.
    - y: Optional pre-allocated output of shape (M, N); allocated if None.
    - config: Optional kernel meta-parameter dict; looked up with
      `_get_config(M, N, K)` if None. The dict passed in (or the cached
      default) is never modified.

    Returns:
    - Y: The output matrix with shape (M, N).
    """

    # NOTE(review): K is taken from w; x's second dimension is not checked
    # against it.
    M, K = x.shape
    N, K = w.shape

    # Transpose w
    w = w.T

    if y is None:
        y = torch.empty((M, N), dtype=dtype, device=x.device)

    if config is None:
        config = _get_config(M, N, K)

    # Work on a private copy: the dict is either owned by the caller or is the
    # object stored in (and returned by) the lru_cache'd `_get_config`. The
    # K-dependent adjustments below must not leak into later calls that use a
    # different K but resolve to the same cached entry.
    config = dict(config)

    if config["NUM_KSPLIT"] > 1:
        SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk(
            K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"]
        )

        config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE
        config["BLOCK_SIZE_K"] = BLOCK_SIZE_K
        config["NUM_KSPLIT"] = NUM_KSPLIT

        # Partial products, one (M, N) slice per K split; reduced below.
        if _USE_GEMM_SPLITK_BF16:
            y_pp = torch.empty(
                (config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device
            )
        else:
            y_pp = torch.empty(
                (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device
            )
    else:
        # Single split covers the whole K range (block sizes count fp4
        # elements, two per packed container, hence 2 * K).
        config["SPLITK_BLOCK_SIZE"] = 2 * K
        y_pp = None

    grid = lambda META: (  # noqa: E731
        (
            META["NUM_KSPLIT"]
            * triton.cdiv(M, META["BLOCK_SIZE_M"])
            * triton.cdiv(N, META["BLOCK_SIZE_N"])
        ),
    )

    _gemm_afp4_wfp4_kernel[grid](
        x,
        w,
        y if config["NUM_KSPLIT"] == 1 else y_pp,
        x_scales,
        w_scales,
        M,
        N,
        K,
        x.stride(0),
        x.stride(1),
        w.stride(0),
        w.stride(1),
        0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(0),
        y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(1),
        y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2),
        x_scales.stride(0),
        x_scales.stride(1),
        w_scales.stride(0),
        w_scales.stride(1),
        **config,
    )

    if config["NUM_KSPLIT"] > 1:
        REDUCE_BLOCK_SIZE_M = 16
        # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails
        # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and
        # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials
        REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64
        # Number of K splits that actually contain data; trailing splits that
        # start at or beyond K are skipped by the GEMM kernel.
        ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2))

        grid_reduce = (
            triton.cdiv(M, REDUCE_BLOCK_SIZE_M),
            triton.cdiv(N, REDUCE_BLOCK_SIZE_N),
        )
        _gemm_afp4_wfp4_reduce_kernel[grid_reduce](
            y_pp,
            y,
            M,
            N,
            y_pp.stride(0),
            y_pp.stride(1),
            y_pp.stride(2),
            y.stride(0),
            y.stride(1),
            REDUCE_BLOCK_SIZE_M,
            REDUCE_BLOCK_SIZE_N,
            ACTUAL_KSPLIT,
            config["NUM_KSPLIT"],
        )

    return y
+ """ + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + M, K = x.shape + N, K = w.shape + + # Transpose w + w = w.T + + if y is None: + y = torch.empty((M, N), dtype=dtype, device=x.device) + + if config is None: + config = _get_config(M, N, K) + + if config["NUM_KSPLIT"] > 1: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + + if _USE_GEMM_SPLITK_BF16: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device + ) + else: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device + ) + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + y_pp = None + + if config["BLOCK_SIZE_K"] >= 2 * K: + config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) + config["SPLITK_BLOCK_SIZE"] = 2 * K + + config["BLOCK_SIZE_N"] = max(config["BLOCK_SIZE_N"], 32) + + grid = lambda META: ( # noqa: E731 + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), + ) + + _gemm_afp4_wfp4_kernel_preshuffled_scales[grid]( + x, + w, + y if config["NUM_KSPLIT"] == 1 else y_pp, + x_scales, + w_scales, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), + y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(1), + y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), + x_scales.stride(0), + x_scales.stride(1), + w_scales.stride(0), + w_scales.stride(1), + **config, + ) + + if config["NUM_KSPLIT"] > 1: + REDUCE_BLOCK_SIZE_M = 16 + # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails + # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and + # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials + REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 
@triton.heuristics(
    {
        "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0)
        and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0)
        and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0),
        "GRID_MN": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"])
        * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]),
    }
)
@triton.jit
def _gemm_afp4_wfp4_pre_quant_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    b_scales_ptr,
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_ck,
    stride_cm,
    stride_cn,
    stride_bsn,
    stride_bsk,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    NUM_KSPLIT: tl.constexpr,
    SPLITK_BLOCK_SIZE: tl.constexpr,
    EVEN_K: tl.constexpr,
    GRID_MN: tl.constexpr,
    cache_modifier: tl.constexpr,
):
    """Kernel for computing the matmul C = A x B with on-the-fly A quantization.

    B is in the microscale fp4 (mxfp4) format, two values packed per
    container, with one scale per 32 elements along K (B_scales).
    A is loaded unpacked and quantized to mxfp4 inside the K loop with
    `_mxfp4_quant_op`, which also produces the A scales.
    K is the packed extent of B along K; A therefore spans 2 * K elements.
    Each program accumulates its partial tile into C with an atomic add, so
    split-K partials are summed directly in the output buffer.
    """

    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)
    tl.assume(stride_bsk > 0)
    tl.assume(stride_bsn > 0)

    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid_unified = tl.program_id(axis=0)
    pid_k = pid_unified % NUM_KSPLIT
    pid = pid_unified // NUM_KSPLIT
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    if NUM_KSPLIT == 1:
        # remap_xcd returns the remapped id; the result has to be kept,
        # otherwise the call has no effect.
        pid = remap_xcd(pid, GRID_MN)

        pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M)
    else:
        pid_m = pid // num_pid_n
        pid_n = pid % num_pid_n

    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
    # We assume 32 elements along K share the same scale.
    SCALE_GROUP_SIZE: tl.constexpr = 32

    # Splits that start at or beyond K have nothing to compute.
    if (pid_k * SPLITK_BLOCK_SIZE // 2) < K:

        num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2)

        # Create pointers for first block of A and B input matrices
        # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container.
        # A is unpacked, so it is indexed with full BLOCK_SIZE_K offsets.
        offs_k_bf16 = tl.arange(0, BLOCK_SIZE_K)
        offs_k_split_bf16 = pid_k * SPLITK_BLOCK_SIZE + offs_k_bf16
        offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
        a_ptrs = a_ptr + (
            offs_am[:, None] * stride_am + offs_k_split_bf16[None, :] * stride_ak
        )

        offs_k = tl.arange(0, BLOCK_SIZE_K // 2)
        offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k
        offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
        b_ptrs = b_ptr + (
            offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn
        )
        # Create pointers for the first block of B scales
        offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)) + tl.arange(
            0, BLOCK_SIZE_K // SCALE_GROUP_SIZE
        )
        # B scales are N x K even though B operand is K x N.
        b_scale_ptrs = (
            b_scales_ptr + offs_bn[:, None] * stride_bsn + offs_ks[None, :] * stride_bsk
        )

        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

        for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter):
            b_scales = tl.load(b_scale_ptrs)
            # Load the next block of A and B, generate a mask by checking the K dimension.
            # If it is out of bounds, set it to 0.
            if EVEN_K:
                a_bf16 = tl.load(a_ptrs)
                b = tl.load(b_ptrs, cache_modifier=cache_modifier)
            else:
                # The A tile is BLOCK_SIZE_K wide and A spans 2 * K unpacked
                # elements, so the mask must use the full-width offsets and
                # the unpacked bound (the packed `offs_k` / `K` pair only
                # matches the B tile).
                a_bf16 = tl.load(
                    a_ptrs,
                    mask=offs_k_bf16[None, :] < 2 * K - k * BLOCK_SIZE_K,
                    other=0,
                )
                b = tl.load(
                    b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0
                )

            # Quantize the A tile to mxfp4 and obtain its per-group scales.
            a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32)

            accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1")

            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak
            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
            b_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_bsk

        c = accumulator.to(c_ptr.type.element_ty)

        # Accumulate the block into the output matrix C with masks.
        offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
        offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
        c_ptrs = (
            c_ptr
            + stride_cm * offs_cm[:, None]
            + stride_cn * offs_cn[None, :]
            + pid_k * stride_ck
        )
        c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
        tl.atomic_add(c_ptrs, c, mask=c_mask, sem="relaxed")
json.load(file) + _get_config._config_dict["default"] = config + + key = f"{N}_{K}" + if key not in _get_config._config_dict.keys(): + dev = arch_info.get_device() + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM_PREQUANT-AFP4WFP4-N={N}-K={2*K}.json" + if os.path.exists(fpath): + with open(fpath, "r") as file: + config = json.load(file) + _get_config._config_dict[key] = config + else: + key = "default" # fall back to default config + + if M < 32: + config = _get_config._config_dict[key]["small"] + elif M <= 128: + BLK_M = triton.next_power_of_2(M) + if BLK_M == 32: + config = _get_config._config_dict[key]["medium_M32"] + elif BLK_M == 64: + config = _get_config._config_dict[key]["medium_M64"] + elif BLK_M == 128: + config = _get_config._config_dict[key]["medium_M128"] + elif M <= 256: + config = _get_config._config_dict[key]["large"] + else: + config = _get_config._config_dict[key]["xlarge"] + + config = config.copy() + + if config["NUM_KSPLIT"] > 1: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + + if config["BLOCK_SIZE_K"] >= 2 * K: + config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) + config["SPLITK_BLOCK_SIZE"] = 2 * K + + return config + + +def gemm_afp4wfp4_pre_quant( + x, + w, + w_scales, + dtype: Optional[float] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, +): + """ + Computes the matmul Y = X x W + W is an e2m1 fp4 tensor and w_scales is an e8m0 tensor. + Every 32 elements in the K dimension share one e8m0 scale. + X gets quantized to the microscale fp4 (mxfp4) format before the GEMM. + + + Key parameters: + - X: Matrix X with shape (M, K). + - W: Matrix W with shape (N, K). 
def reverse_awq_order(tensor: torch.Tensor) -> torch.Tensor:
    """Reverse the AWQ order of the given tensor.

    The last dimension is processed in groups of 8 (the number of 4-bit
    values in a 32-bit word); inside every group the elements are permuted
    with the order [0, 4, 1, 5, 2, 6, 3, 7]. The permutation is applied to
    the last dimension regardless of how many leading dimensions there are.

    Args:
        tensor: Input tensor to reorder. Its last dimension must be a
            multiple of 8.

    Returns:
        Reordered tensor with bits masked to 4 bits
    """
    bits = 4
    AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
    reverse_order_tensor = torch.arange(
        tensor.shape[-1],
        dtype=torch.int32,
        device=tensor.device,
    )
    # One row per group of 8 positions; permuting the columns permutes the
    # positions inside every group.
    reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
    reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
    reverse_order_tensor = reverse_order_tensor.view(-1)

    # The index was built from shape[-1], so it must be applied to the last
    # dimension; `...` keeps this correct for any number of leading dims
    # (for 2-D input this is identical to `tensor[:, idx]`).
    return tensor[..., reverse_order_tensor] & 0xF
@triton.heuristics(values={
    "NUM_GROUPS": lambda args: triton.cdiv(args["BLOCK_SIZE_K"], args["group_size"]),
    "BLOCK_SIZE_K2": lambda args: args["BLOCK_SIZE_K"] // 2
})
@triton.jit
def awq_dequantize_kernel(
        qweight_ptr,  # quantized matrix
        scales_ptr,  # scales, per group
        zeros_ptr,  # zeros, per group
        result_ptr,  # Output matrix
        N,
        N2,
        K,
        K2,
        group_size: tl.constexpr,  # Should always be one of the supported group sizes
        NUM_GROUPS: tl.constexpr,
        BLOCK_SIZE_N: tl.constexpr,
        BLOCK_SIZE_K: tl.constexpr,
        BLOCK_SIZE_K2: tl.constexpr):
    # Dequantize one (BLOCK_SIZE_K, BLOCK_SIZE_N) tile of 4-bit weights:
    #     result[k, n] = (weight[n, k] - zero[k // group, n]) * scale[k // group, n]
    #
    # Addressing used below (all offsets are flat element offsets):
    #   qweight : K2 * n + k2          -> two 4-bit weights per element along K
    #   zeros   : N2 * group + n // 2  -> two 4-bit zeros per element along N
    #   scales  : N * group + n
    #   result  : N * k + n
    # NOTE(review): presumably K2 == K // 2 and N2 == N // 2 -- confirm
    # against the caller.
    # NUM_GROUPS (groups touched by one K block) and BLOCK_SIZE_K2
    # (BLOCK_SIZE_K // 2) are derived by the heuristics above.

    # Setup the pids.
    pid_n = tl.program_id(axis=0)
    pid_k = tl.program_id(axis=1)

    tl.assume(pid_n >= 0)
    tl.assume(pid_k >= 0)
    tl.assume(N > 0)
    tl.assume(K > 0)
    tl.assume(N2 > 0)
    tl.assume(K2 > 0)
    tl.assume(BLOCK_SIZE_N > 0)
    tl.assume(BLOCK_SIZE_K > 0)
    tl.assume(BLOCK_SIZE_K2 > 0)
    tl.assume(group_size > 0)

    # Compute offsets and masks for qweight_ptr.
    offsets_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offsets_n = tl.max_contiguous(tl.multiple_of(offsets_n, BLOCK_SIZE_N), BLOCK_SIZE_N)
    offsets_k = pid_k * BLOCK_SIZE_K2 + tl.arange(0, BLOCK_SIZE_K2)
    offsets_k = tl.max_contiguous(tl.multiple_of(offsets_k, BLOCK_SIZE_K2), BLOCK_SIZE_K2)
    offsets = K2 * offsets_n[:, None] + offsets_k[None, :]

    masks_n = offsets_n < N
    masks_k = offsets_k < K2

    masks = masks_n[:, None] & masks_k[None, :]

    # Compute offsets and masks for result output ptr.
    result_offsets_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    result_offsets_k = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
    result_offsets = (N * result_offsets_k[:, None] + result_offsets_n[None, :])  # [K, N]

    result_masks_n = result_offsets_n < N
    result_masks_k = result_offsets_k < K
    result_masks = result_masks_k[:, None] & result_masks_n[None, :]

    # Load the weights.
    iweights = tl.load(qweight_ptr + offsets, masks, 0.0)  #[BLOCK_SIZE_N, BLOCK_SIZE_K//2]
    # Duplicate every packed element into two adjacent columns so that each
    # copy can yield one of its two 4-bit values.
    iweights = tl.interleave(iweights, iweights)  # [BLOCK_SIZE_N, BLOCK_SIZE_K]

    # Use this to compute a set of shifts that can be used to unpack and
    # reorder the values in iweights and zeros.
    # Shifts alternate 0, 4 along K: even columns take the low nibble, odd
    # columns the high nibble.
    shifts = tl.arange(0, 2) * 4
    bshifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_N * BLOCK_SIZE_K2, 2))
    bshifts = tl.reshape(bshifts, (BLOCK_SIZE_N, BLOCK_SIZE_K))

    # Unpack and reorder: shift out the correct 4-bit value and mask.
    iweights = (iweights >> bshifts) & 0xF

    # Compute zero offsets and masks.
    zero_offsets_k = pid_k * BLOCK_SIZE_K // group_size + tl.arange(0, NUM_GROUPS)
    zero_offsets_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    # Two neighbouring columns share one packed zero element.
    zero_offsets_n2 = zero_offsets_n // 2
    zero_offsets = N2 * zero_offsets_k[:, None] + zero_offsets_n2[None, :]

    zero_masks_k = zero_offsets_k < K//group_size
    zero_masks_n = zero_offsets_n < N
    zero_masks = zero_masks_k[:, None] & zero_masks_n[None, :]

    # Load the zeros.
    zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0)  # [NUM_GROUPS, BLOCK_SIZE_N]

    # Compute scale offsets and masks.
    scale_offsets_k = pid_k * BLOCK_SIZE_K // group_size + tl.arange(0, NUM_GROUPS)
    scale_offsets_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N))
    scale_offsets = N * scale_offsets_k[:, None] + scale_offsets_n[None, :]
    scale_masks_k = scale_offsets_k < K//group_size
    scale_masks_n = scale_offsets_n < N
    scale_masks = scale_masks_k[:, None] & scale_masks_n[None, :]

    # Load the scales.
    scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0)  # [NUM_GROUPS, BLOCK_SIZE_N]

    # Expand the per-group values so that every K row of the tile has its
    # group's zero and scale.
    if NUM_GROUPS == 1:
        zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N))  # [BLOCK_SIZE_K, BLOCK_SIZE_N]
        scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N))  # [BLOCK_SIZE_K, BLOCK_SIZE_N]
    else:
        # Repeat each group's row group_size times along K; the reshape below
        # requires NUM_GROUPS * group_size == BLOCK_SIZE_K.
        zeros = tl.broadcast_to(zeros[:, None, :], (NUM_GROUPS, group_size, BLOCK_SIZE_N))
        scales = tl.broadcast_to(scales[:, None, :], (NUM_GROUPS, group_size, BLOCK_SIZE_N))
        zeros = tl.reshape(zeros, [BLOCK_SIZE_K, BLOCK_SIZE_N])
        scales = tl.reshape(scales, [BLOCK_SIZE_K, BLOCK_SIZE_N])

    # Unpack and reorder: shift out the correct 4-bit value and mask.
    # Even columns use the low nibble of the shared element, odd columns the
    # high nibble.
    zshifts = (zero_offsets_n[None, :] % 2) * 4  # [1, BLOCK_SIZE_N]
    zeros = (zeros >> zshifts) & 0xF  # [BLOCK_SIZE_K, BLOCK_SIZE_N]

    # Dequantize.
    # iweights is [BLOCK_SIZE_N, BLOCK_SIZE_K]; transpose to match the
    # [BLOCK_SIZE_K, BLOCK_SIZE_N] zeros/scales and the output layout.
    iweights = (iweights.T - zeros) * scales
    iweights = iweights.to(result_ptr.type.element_ty)

    # Finally, store.
    tl.store(result_ptr + result_offsets, iweights, result_masks)
+ offsets_am = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + masks_am = offsets_am < M + + #offsets_zn = tile_idx_n * (BLOCK_SIZE_N // 2) + tl.arange(0, BLOCK_SIZE_N // 2) + offsets_zn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offsets_zn = offsets_zn // 2 + #offsets_zn = tl.max_contiguous(tl.multiple_of(offsets_zn, BLOCK_SIZE_N // 2), BLOCK_SIZE_N // 2) + masks_zn = offsets_zn < N2 + + offsets_bn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + #offsets_bn = tl.max_contiguous(tl.multiple_of(offsets_bn, BLOCK_SIZE_N), BLOCK_SIZE_N) + masks_bn = offsets_bn < N + + offsets_sn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + #offsets_sn = tl.max_contiguous(tl.multiple_of(offsets_sn, BLOCK_SIZE_N), BLOCK_SIZE_N) + masks_sn = offsets_sn < N + + offsets_ak = iter_begin * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + #offsets_ak = tl.max_contiguous(tl.multiple_of(offsets_ak, BLOCK_SIZE_K), BLOCK_SIZE_K) + offsets_a = K * offsets_am[:, None] + offsets_ak[None, :] + + offsets_bk = iter_begin * (BLOCK_SIZE_K // 2) + tl.arange(0, BLOCK_SIZE_K // 2) + #offsets_bk = tl.max_contiguous(tl.multiple_of(offsets_bk, BLOCK_SIZE_K // 2), BLOCK_SIZE_K // 2) + #offsets_b = offsets_bk[:, None] + K // 2 * offsets_bn[None, :] + #offsets_b = K // 2 * offsets_bn[:, None] + offsets_bk[None, :] + offsets_b = K2 * offsets_bn[:, None] + offsets_bk[None, :] + zshifts = (offsets_bn[:, None] % 2) * 4 # [N, 1] + zshifts = zshifts.T + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + for k in range(iter_end - iter_begin): + masks_ak = offsets_ak < K + masks_bk = offsets_bk < K2 + masks_a = masks_am[:, None] & masks_ak[None, :] + masks_b = masks_bn[:, None] & masks_bk[None, :] + other_bzs = 0.0 + a = tl.load(a_ptrs, mask=masks_a, other=0.) + b = tl.load(b_ptrs, masks_b, other_bzs) #[N, K//2] + b = tl.interleave(b, b) # [N, K] + + # Dequantize b. 
+ offsets_szk = ((BLOCK_SIZE_K * k + iter_begin * BLOCK_SIZE_K) // GROUP_SIZE + tl.arange(0, NUM_GROUPS)) + masks_szk = offsets_szk < K // GROUP_SIZE + masks_z = masks_szk[:, None] & masks_zn[None, :] + masks_s = masks_szk[:, None] & masks_sn[None, :] + #masks_z = masks_zn[:, None] & masks_szk[None, :] + #masks_s = masks_sn[:, None] & masks_szk[None, :] + + offsets_z = N2 * offsets_szk[:, None] + offsets_zn[None, :] + #offsets_z = K // GROUP_SIZE * offsets_zn[:, None] + offsets_szk[None, :] + zeros_ptrs = zeros_ptr + offsets_z + zeros = tl.load(zeros_ptrs, mask=masks_z, other=other_bzs) # [K//G, N] + #zshifts = (offsets_bn[:, None] % 2) * 4 # [N, 1] + #zeros = (zeros >> _zshifts) & 0xF # [N, K//G] + + ''' + zeros = zeros.T # [K//G, N//2] + zeros = tl.interleave(zeros, zeros) # [K//G, N] + zeros = zeros.T # [N, K//G] + ''' + + offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :] + #offsets_s = K // GROUP_SIZE * offsets_sn[:, None] + offsets_szk[None, :] + scales_ptrs = scales_ptr + offsets_s + scales = tl.load(scales_ptrs, mask=masks_s, other=other_bzs) # [K//G, N] + + if NUM_GROUPS == 1: + # Original efficient implementation for single group + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + #zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + #scales = tl.broadcast_to(scales, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + else: + # Reshape to (NUM_GROUPS, 1, N) then broadcast to (NUM_GROUPS, group_size_in_block, N) + # Reshape to (K//G, 1, N) then broadcast to (K//G, group_size_in_block, N) + zeros = tl.broadcast_to(zeros[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + ## Reshape back to (BLOCK_SIZE_K, N) + zeros = tl.reshape(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.reshape(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + + # Reshape to (N, K//G, 1) then broadcast to (N, K//G, 
group_size_in_block) + #zeros = tl.broadcast_to(zeros[:, :, None], (BLOCK_SIZE_N, NUM_GROUPS, GROUP_SIZE)) + #scales = tl.broadcast_to(scales[:, :, None], (BLOCK_SIZE_N, NUM_GROUPS, GROUP_SIZE)) + #zeros = tl.reshape(zeros, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + #scales = tl.reshape(scales, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + + b = (b >> bshifts) & 0xF + b = b.T + zeros = (zeros >> zshifts) & 0xF + b = (b - zeros) * scales + b = b.to(a_ptr.type.element_ty) + + # Accumulate results. + accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32) + + offsets_ak += BLOCK_SIZE_K + offsets_bk += BLOCK_SIZE_K // 2 + a_ptrs += BLOCK_SIZE_K + b_ptrs += BLOCK_SIZE_K // 2 + + c = accumulator.to(c_ptr.type.element_ty) + offs_cm = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # compiler hints + offs_cm = tl.max_contiguous(tl.multiple_of(offs_cm, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_cn = tl.max_contiguous(tl.multiple_of(offs_cn, BLOCK_SIZE_N), BLOCK_SIZE_N) + offs_c = M * N * k_idx + N * offs_cm[:, None] + offs_cn[None, :] + c_ptrs = c_ptr + offs_c + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + + if USE_REDUCE_KERNEL: + tl.store(c_ptrs, c, mask=c_mask) + else: + if c_ptr.type.element_ty == tl.float16: + tl.store(c_ptrs, c, mask=c_mask) + elif not_reduce: + tl.store(c_ptrs, c, mask=c_mask) + else: + tl.atomic_add(c_ptrs, c, mask=c_mask) + +@triton.jit +def awq_gemm_scatter_kernel_inner( + a_ptr, b_ptr, + c_ptr, + zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, not_reduce, + cur_rank, + world_size, + GROUP_SIZE: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + NUM_GROUPS: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr = 0): + + if not USE_REDUCE_KERNEL: + k_idx = 0 + + tl.assume(tile_idx >= 0) + tl.assume(k_idx >= 0) + tl.assume(iter_begin >= 0) + tl.assume(iter_end >= 0) + tl.assume(M > 0) + tl.assume(N > 0) + 
tl.assume(K > 0) + tl.assume(K2 > 0) + tl.assume(N2 > 0) + + num_tile_m = tl.cdiv(M, BLOCK_SIZE_M) + num_tile_n = tl.cdiv(N, BLOCK_SIZE_N) + + tile_idx_m = tile_idx // num_tile_n + tile_idx_n = tile_idx % num_tile_n + + # NOTE: this may be used when M is large + # currently M range is small (1~128), so does M_PER_RANK + # M_PER_RANK = M // world_size + # num_tile_m_per_rank = M_PER_RANK // BLOCK_SIZE_M + # rank_offset = tile_idx_m // num_tile_m_per_rank + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + shifts = tl.arange(0, 2) * 4 + bshifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_N * (BLOCK_SIZE_K // 2), 2)) + bshifts = tl.reshape(bshifts, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + + # Offsets and masks. + offsets_am = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + masks_am = offsets_am < M + + offsets_zn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offsets_zn = offsets_zn // 2 + masks_zn = offsets_zn < N2 + + offsets_bn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + masks_bn = offsets_bn < N + + offsets_sn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + masks_sn = offsets_sn < N + + offsets_ak = iter_begin * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + offsets_a = K * offsets_am[:, None] + offsets_ak[None, :] + + offsets_bk = iter_begin * (BLOCK_SIZE_K // 2) + tl.arange(0, BLOCK_SIZE_K // 2) + offsets_b = K2 * offsets_bn[:, None] + offsets_bk[None, :] + zshifts = (offsets_bn[:, None] % 2) * 4 # [N, 1] + zshifts = zshifts.T + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + for k in range(iter_end - iter_begin): + masks_ak = offsets_ak < K + masks_bk = offsets_bk < K2 + masks_a = masks_am[:, None] & masks_ak[None, :] + masks_b = masks_bn[:, None] & masks_bk[None, :] + other_bzs = 0.0 + a = tl.load(a_ptrs, mask=masks_a, other=0.) + b = tl.load(b_ptrs, masks_b, other_bzs) #[N, K//2] + b = tl.interleave(b, b) # [N, K] + + # Dequantize b. 
+ offsets_szk = ((BLOCK_SIZE_K * k + iter_begin * BLOCK_SIZE_K) // GROUP_SIZE + tl.arange(0, NUM_GROUPS)) + masks_szk = offsets_szk < K // GROUP_SIZE + masks_z = masks_szk[:, None] & masks_zn[None, :] + masks_s = masks_szk[:, None] & masks_sn[None, :] + + offsets_z = N2 * offsets_szk[:, None] + offsets_zn[None, :] + zeros_ptrs = zeros_ptr + offsets_z + zeros = tl.load(zeros_ptrs, mask=masks_z, other=other_bzs) # [K//G, N] + + offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :] + #offsets_s = K // GROUP_SIZE * offsets_sn[:, None] + offsets_szk[None, :] + scales_ptrs = scales_ptr + offsets_s + scales = tl.load(scales_ptrs, mask=masks_s, other=other_bzs) # [K//G, N] + + if NUM_GROUPS == 1: + # Original efficient implementation for single group + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + else: + # Reshape to (NUM_GROUPS, 1, N) then broadcast to (NUM_GROUPS, group_size_in_block, N) + # Reshape to (K//G, 1, N) then broadcast to (K//G, group_size_in_block, N) + zeros = tl.broadcast_to(zeros[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + ## Reshape back to (BLOCK_SIZE_K, N) + zeros = tl.reshape(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.reshape(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + + b = (b >> bshifts) & 0xF + b = b.T + zeros = (zeros >> zshifts) & 0xF + b = (b - zeros) * scales + b = b.to(a_ptr.type.element_ty) + + # Accumulate results. 
+ accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32) + + offsets_ak += BLOCK_SIZE_K + offsets_bk += BLOCK_SIZE_K // 2 + a_ptrs += BLOCK_SIZE_K + b_ptrs += BLOCK_SIZE_K // 2 + + c = accumulator.to(c_ptr.type.element_ty) + + # offs_cm = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + # NOTE: plus cur_rank offset + offs_cm = M * cur_rank + tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + # compiler hints + offs_cm = tl.max_contiguous(tl.multiple_of(offs_cm, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_cn = tl.max_contiguous(tl.multiple_of(offs_cn, BLOCK_SIZE_N), BLOCK_SIZE_N) + + # batch index + offs_c = world_size * M * N * k_idx + N * offs_cm[:, None] + offs_cn[None, :] + c_mask = (offs_cm[:, None] < M * world_size) & (offs_cn[None, :] < N) + + # scatter to other ranks + for j in tl.range(world_size, flatten=True): + peer_rank = (j + cur_rank + 1) % world_size # swizzle + c_ptr_peer = libshmem_device.remote_ptr(c_ptr, peer_rank).to(tl.pointer_type(c_ptr.type.element_ty)) + c_ptrs = c_ptr_peer + offs_c + tl.store(c_ptrs, c, mask=c_mask) + +@triton.jit +def awq_gemm_kernel_streamk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, NUM_CUS: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr, + NUM_GROUPS: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr): + + pid = tl.program_id(axis=0) + + if pid < DP_TILES: + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K) + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, pid, 0, 0, iters_per_cta, + M, N, N2, K, K2, True, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + else: + iters_per_tile = tl.cdiv(K, BLOCK_SIZE_K) + total_iters = iters_per_tile * DANGLING_TILES + iters_per_cta = tl.cdiv(total_iters, NUM_CUS) + + iter_begin = (pid - DP_TILES) * 
iters_per_cta + iter_end = tl.minimum(iter_begin + iters_per_cta, total_iters) + + while iter_begin < iter_end: + tile_idx = iter_begin // iters_per_tile + DP_TILES + tile_iter_begin = (tile_idx - DP_TILES) * iters_per_tile + tile_iter_end = tile_iter_begin + iters_per_tile + local_iter_begin = iter_begin - tile_iter_begin + local_iter_end = tl.minimum(iter_end, tile_iter_end) - tile_iter_begin + k_idx = tl.cdiv(local_iter_begin, iters_per_cta) + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, tile_idx, k_idx, local_iter_begin, local_iter_end, + M, N, N2, K, K2, False, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + iter_begin = tile_iter_end + +@triton.jit +def awq_gemm_kernel_splitk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, NUM_CUS: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + SPLITK: tl.constexpr, NUM_GROUPS: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr): + + pid = tl.program_id(axis=0) + + tiles_M = tl.cdiv(M, BLOCK_SIZE_M) + tiles_N = tl.cdiv(N, BLOCK_SIZE_N) + total_tiles = tiles_M * tiles_N + tile_idx = pid % total_tiles + + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K * SPLITK) + iter_begin = pid // total_tiles * iters_per_cta + iter_end = iter_begin + iters_per_cta + k_idx = tl.cdiv(iter_begin, iters_per_cta) + + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, False, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + +@triton.jit +def awq_gemm_kernel_splitk_fused( + a_ptr, b_ptr, c_ptr, + zeros_ptr, scales_ptr, + out_ptr, barrier_ptr, + M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, + NUM_CUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + SPLITK: tl.constexpr, + NUM_GROUPS: tl.constexpr, + USE_REDUCE_KERNEL: tl.constexpr): + + pid 
= tl.program_id(axis=0) + tiles_M = tl.cdiv(M, BLOCK_SIZE_M) + tiles_N = tl.cdiv(N, BLOCK_SIZE_N) + total_tiles = tiles_M * tiles_N + + if pid < total_tiles * SPLITK: + tile_idx = pid % total_tiles + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K * SPLITK) + iter_begin = pid // total_tiles * iters_per_cta + iter_end = iter_begin + iters_per_cta + k_idx = tl.cdiv(iter_begin, iters_per_cta) + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, False, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + # set barriers + tile_idx_m = tile_idx // tiles_N + tile_idx_n = tile_idx % tiles_N + offset = total_tiles * k_idx + tiles_N * tile_idx_m + tile_idx_n + tl.store(barrier_ptr + offset, 1, cache_modifier=".wt") + else: + pid = pid - total_tiles * SPLITK + # reduce kernel + pid_m = pid // tiles_N + pid_n = pid % tiles_N + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # compiler hints + offs_m = tl.max_contiguous(tl.multiple_of(offs_m, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_n = tl.max_contiguous(tl.multiple_of(offs_n, BLOCK_SIZE_N), BLOCK_SIZE_N) + + mask_m = offs_m < M + mask_n = offs_n < N + mask = mask_m[:, None] & mask_n[None, :] + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + # reduce split k + for batch_idx in range(SPLITK): + batch_offset = batch_idx * M * N + input_offsets = batch_offset + offs_m[:, None] * N + offs_n[None, :] + # wait barrier + offset = total_tiles * batch_idx + pid_m * tiles_N + pid_n + while tl.load(barrier_ptr + offset, cache_modifier=".cv", volatile=True) != 1: + pass + input_data = tl.load(c_ptr + input_offsets, mask=mask, other=0.0) + acc += input_data + + output_offsets = offs_m[:, None] * N + offs_n[None, :] + acc_f16 = acc.to(tl.float16) + tl.store(out_ptr + output_offsets, acc_f16, mask=mask) + +@triton.jit +def 
awq_gemm_allreduce_kernel_splitk( + a_ptr, b_ptr, c_ptr, + zeros_ptr, scales_ptr, + out_ptr, barrier_ptr, + M, N, N2, K, K2, + cur_rank, world_size, + GROUP_SIZE: tl.constexpr, + NUM_CUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + SPLITK: tl.constexpr, + NUM_GROUPS: tl.constexpr, + USE_REDUCE_KERNEL: tl.constexpr): + + pid = tl.program_id(axis=0) + tiles_M = tl.cdiv(M, BLOCK_SIZE_M) + tiles_N = tl.cdiv(N, BLOCK_SIZE_N) + total_tiles = tiles_M * tiles_N + + if pid < total_tiles * SPLITK: + tile_idx = pid % total_tiles + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K * SPLITK) + iter_begin = pid // total_tiles * iters_per_cta + iter_end = iter_begin + iters_per_cta + k_idx = tl.cdiv(iter_begin, iters_per_cta) + c_ptr_remote = tl.load(c_ptr + cur_rank, cache_modifier=".cg").to(tl.pointer_type(tl.float32)) # new added + awq_gemm_kernel_inner(a_ptr, b_ptr, c_ptr_remote, zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, False, GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + # set barriers + tile_idx_m = tile_idx // tiles_N + tile_idx_n = tile_idx % tiles_N + offset = total_tiles * k_idx + tiles_N * tile_idx_m + tile_idx_n + barrier_ptr_remote = tl.load(barrier_ptr + cur_rank).to(tl.pointer_type(tl.int32)) # new added + tl.store(barrier_ptr_remote + offset, 1) + else: + pid = pid - total_tiles * SPLITK + # reduce kernel + pid_m = pid // tiles_N + pid_n = pid % tiles_N + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # compiler hints + offs_m = tl.max_contiguous(tl.multiple_of(offs_m, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_n = tl.max_contiguous(tl.multiple_of(offs_n, BLOCK_SIZE_N), BLOCK_SIZE_N) + + mask_m = offs_m < M + mask_n = offs_n < N + mask = mask_m[:, None] & mask_n[None, :] + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + # reduce split k + # for 
batch_idx in range(SPLITK): + # batch_offset = batch_idx * M * N + # input_offsets = batch_offset + offs_m[:, None] * N + offs_n[None, :] + # # wait barrier + # offset = total_tiles * batch_idx + pid_m * tiles_N + pid_n + # while tl.load(barrier_ptr + offset, cache_modifier=".cv", volatile=True) != 1: + # pass + # input_data = tl.load(c_ptr + input_offsets, mask=mask, other=0.0) + # acc += input_data + + for i in tl.range(world_size, flatten=True): + # swizzle (always compute cur_rank 1st) + rank_id = (i + cur_rank) % world_size + # remote_barrier_ptr = libshmem_device.remote_ptr(barrier_ptr, rank_id).to(tl.pointer_type(tl.int32)) + # remote_c_ptr = libshmem_device.remote_ptr(c_ptr, rank_id).to(tl.pointer_type(c_ptr.type.element_ty)) + remote_barrier_ptr = tl.load(barrier_ptr + rank_id).to(tl.pointer_type(tl.int32)) + remote_c_ptr = tl.load(c_ptr + rank_id).to(tl.pointer_type(tl.float32)) + for batch_idx in range(SPLITK): + batch_offset = batch_idx * M * N + input_offsets = batch_offset + offs_m[:, None] * N + offs_n[None, :] + # wait barrier + offset = total_tiles * batch_idx + pid_m * tiles_N + pid_n + while tl.load(remote_barrier_ptr + offset, cache_modifier=".cv", volatile=True) != 1: + pass + input_data = tl.load(remote_c_ptr + input_offsets, mask=mask, other=0.0, volatile=True) + acc += input_data + + output_offsets = offs_m[:, None] * N + offs_n[None, :] + acc_f16 = acc.to(tl.float16) + tl.store(out_ptr + output_offsets, acc_f16, mask=mask) + +@triton.jit +def awq_gemm_scatter_kernel_splitk( + a_ptr, b_ptr, + scatter_ptr, + zeros_ptr, scales_ptr, + M, N, N2, K, K2, + cur_rank, + world_size, + GROUP_SIZE: tl.constexpr, + NUM_CUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + SPLITK: tl.constexpr, + NUM_GROUPS: tl.constexpr, + USE_REDUCE_KERNEL: tl.constexpr): + + pid = tl.program_id(axis=0) + tiles_M = tl.cdiv(M, BLOCK_SIZE_M) + tiles_N = tl.cdiv(N, BLOCK_SIZE_N) + total_tiles = tiles_M * tiles_N + 
iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K * SPLITK) + + tile_idx = pid % total_tiles + iter_begin = pid // total_tiles * iters_per_cta + iter_end = iter_begin + iters_per_cta + k_idx = tl.cdiv(iter_begin, iters_per_cta) + + # TODO: expand this func + awq_gemm_scatter_kernel_inner( + a_ptr, b_ptr, scatter_ptr, + zeros_ptr, scales_ptr, tile_idx, k_idx, iter_begin, iter_end, + M, N, N2, K, K2, False, + cur_rank, + world_size, + GROUP_SIZE, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, + NUM_GROUPS, USE_REDUCE_KERNEL) + +''' +@triton.autotune( + configs=[ + triton.Config({ + }, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8, 16] for num_stages in [1, 2] + ], + key=["M", "K", "N", "GROUP_SIZE", "BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K", "SCHEDULER", "SPLITK"], + perf_debug=True, + #enable=int(os.getenv("TRITON_DO_AUTOTUNING", 0)) == 1, + #prune_configs_by={ + # "early_config_prune": lambda configs, nargs, **kwargs: [ + # config for config in configs + # # SCHEDULE=1 代表 STREAMK,不需要遍历那么多 SPLITK 的值 + # if config.all_kwargs()["SCHEDULER"] == 0 or (config.all_kwargs()["SCHEDULER"] == 1 and config.all_kwargs()["SPLITK"] == 1) + # ] + #} +) +@triton.heuristics(values={ + "NUM_GROUPS": lambda args: triton.cdiv(args["BLOCK_SIZE_K"], args["GROUP_SIZE"]) +}) +''' +@triton.jit +def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, + GROUP_SIZE: tl.constexpr, NUM_CUS: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + NUM_GROUPS: tl.constexpr, + DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr, + SPLITK: tl.constexpr, SCHEDULER: tl.constexpr, USE_REDUCE_KERNEL: tl.constexpr): + if SCHEDULER == 0: + return awq_gemm_kernel_splitk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, GROUP_SIZE, NUM_CUS, BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + SPLITK, NUM_GROUPS, USE_REDUCE_KERNEL) + else: + return awq_gemm_kernel_streamk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, 
M, N, N2, K, K2, GROUP_SIZE, NUM_CUS, BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + DP_TILES, DANGLING_TILES, NUM_GROUPS, USE_REDUCE_KERNEL) + +@triton.jit +def awq_gemm_kernel_fused( + a_ptr, b_ptr, c_ptr, + zeros_ptr, + scales_ptr, + out_ptr, barrier_ptr, + M, N, N2, K, K2, + cur_rank, world_size, + GROUP_SIZE: tl.constexpr, + NUM_CUS: tl.constexpr, + # NUM_GEMM_CUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + NUM_GROUPS: tl.constexpr, + DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr, + SPLITK: tl.constexpr, SCHEDULER: tl.constexpr, + USE_REDUCE_KERNEL: tl.constexpr): + if SCHEDULER == 0: + # return awq_gemm_kernel_splitk_fused( + # a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, + # out_ptr, barrier_ptr, + # M, N, N2, K, K2, + # GROUP_SIZE, + # NUM_CUS, + # # NUM_GEMM_CUS, + # BLOCK_SIZE_M, + # BLOCK_SIZE_N, BLOCK_SIZE_K, + # SPLITK, NUM_GROUPS, USE_REDUCE_KERNEL) + return awq_gemm_allreduce_kernel_splitk( + a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, + out_ptr, barrier_ptr, + M, N, N2, K, K2, + cur_rank, world_size, + GROUP_SIZE, + NUM_CUS, + # NUM_GEMM_CUS, + BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + SPLITK, NUM_GROUPS, USE_REDUCE_KERNEL) + else: + # TODO: to be supported + assert False + return awq_gemm_kernel_streamk(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, N2, K, K2, GROUP_SIZE, NUM_CUS, BLOCK_SIZE_M, + BLOCK_SIZE_N, BLOCK_SIZE_K, + DP_TILES, DANGLING_TILES, NUM_GROUPS, USE_REDUCE_KERNEL) + +# refer from dist/04-tutorial +@triton.jit +def tile_id_to_index_range( + tile_id, + M, + N, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + + group_id = tile_id // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + + tile_in_group = tile_id % num_pid_in_group 
+ pid_m = first_pid_m + (tile_in_group % group_size_m) + pid_n = tile_in_group // group_size_m + + rm_start = pid_m * BLOCK_SIZE_M + rn_start = pid_n * BLOCK_SIZE_N + + # clamp to the maximum valid index (M-1, N-1) + max_m = M - 1 + max_n = N - 1 + + # generate indices + rm = rm_start + tl.arange(0, BLOCK_SIZE_M) + rn = rn_start + tl.arange(0, BLOCK_SIZE_N) + + rm = tl.minimum(rm, max_m) + rn = tl.minimum(rn, max_n) + + return rm, rn, rm_start, rn_start + +@triton.jit +def offset_for_tile(local_tile_id, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M, M_local, N_local): + rm, rn, rm_start, rn_start = tile_id_to_index_range( + local_tile_id, M_local, N_local, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M + ) + c_mask = (rm[:, None] < M_local) & (rn[None, :] < N_local) + return rm, rn, c_mask, rm_start, rn_start + +@triton.jit +def extract_submask_and_offset( + M_local, + N_local, + rm, + rn, + mask, + rm_start, + rn_start, + start_row, + start_col, + SUB_BLOCK_SIZE_M: tl.constexpr, + SUB_BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + stride_cm_local: tl.constexpr, + stride_cn_local: tl.constexpr, +): + # Create indices for the sub-block + sub_rm = tl.arange(0, SUB_BLOCK_SIZE_M) + start_row + sub_rn = tl.arange(0, SUB_BLOCK_SIZE_N) + start_col + + # clamp to the maximum valid index (M-1, N-1) + sub_rm = tl.minimum(sub_rm, M_local - 1) + sub_rn = tl.minimum(sub_rn, N_local - 1) + + # Create a 2D grid of indices for the sub-block + sub_rm_2d = sub_rm[:, None] # Shape: (SUB_BLOCK_SIZE_M, 1) + sub_rn_2d = sub_rn[None, :] # Shape: (1, SUB_BLOCK_SIZE_N) + + # Compute the sub-mask + sub_mask = (sub_rm_2d < BLOCK_SIZE_M) & (sub_rn_2d < BLOCK_SIZE_N) + + # Compute the sub-offset relative to the start of the tile + sub_offset = ((rm_start + sub_rm_2d) * stride_cm_local) + ((rn_start + sub_rn_2d) * stride_cn_local) + + return sub_mask, sub_offset + +# GROUP_SIZE_M swizzle +@triton.jit +def _compute_pid(tile_id, num_pid_in_group, num_pid_m, 
GROUP_SIZE_M): + group_id = tile_id // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + # pid_m = first_pid_m + (tile_id % group_size_m) + pid_m = first_pid_m + ((tile_id % num_pid_in_group) % group_size_m) + pid_n = (tile_id % num_pid_in_group) // group_size_m + return pid_m, pid_n + +@triton.jit +def awq_gemm_allreduce_oneshot_kernel( + shm_ctx, + a_ptr, b_ptr, c_ptr, + scatter_bufs_ptr, barrier_bufs_ptr, + zeros_ptr, scales_ptr, + M, N, N2, K, K2, + cur_rank, world_size, + GROUP_SIZE: tl.constexpr, + NUM_CUS: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + SPLITK: tl.constexpr, + NUM_GROUPS: tl.constexpr, + USE_REDUCE_KERNEL: tl.constexpr, + DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr, SCHEDULER: tl.constexpr,): + + # set shmem device ctx (rocSHMEM required when call libshmem.func) + # libshmem_device.set_rocshmem_ctx(shm_ctx) + + start_pid = tl.program_id(axis=0) + + tiles_M = tl.cdiv(M, BLOCK_SIZE_M) + tiles_N = tl.cdiv(N, BLOCK_SIZE_N) + total_tiles = tiles_M * tiles_N + # in case split K > 1 + # tile_idx = pid % total_tiles + + iters_per_cta = tl.cdiv(K, BLOCK_SIZE_K) + iter_begin = start_pid // total_tiles * iters_per_cta + # iter_end = iter_begin + iters_per_cta + # k_idx = tl.cdiv(iter_begin, iters_per_cta) + k_idx = 0 + loop_k = tl.cdiv(K, BLOCK_SIZE_K) + + # tl.assume(tile_idx >= 0) + # tl.assume(k_idx >= 0) + tl.assume(iter_begin >= 0) + # tl.assume(iter_end >= 0) + tl.assume(M > 0) + tl.assume(N > 0) + tl.assume(K > 0) + tl.assume(K2 > 0) + tl.assume(N2 > 0) + + num_tile_m = tl.cdiv(M, BLOCK_SIZE_M) + num_tile_n = tl.cdiv(N, BLOCK_SIZE_N) + + # group size M swizzle + # num_pid_in_group = GROUP_SIZE_M * num_tile_n + # tile_idx_m, tile_idx_n = _compute_pid(start_pid, num_pid_in_group, num_tile_m, GROUP_SIZE_M) + + # no swizzle impl + tile_idx_m = start_pid // num_tile_n + tile_idx_n = 
start_pid % num_tile_n + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + shifts = tl.arange(0, 2) * 4 + + bshifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_N * (BLOCK_SIZE_K // 2), 2)) + bshifts = tl.reshape(bshifts, (BLOCK_SIZE_N, BLOCK_SIZE_K)) + + # Offsets and masks. + offsets_am = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + masks_am = offsets_am < M + + #offsets_zn = tile_idx_n * (BLOCK_SIZE_N // 2) + tl.arange(0, BLOCK_SIZE_N // 2) + offsets_zn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offsets_zn = offsets_zn // 2 + masks_zn = offsets_zn < N2 + + offsets_bn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + masks_bn = offsets_bn < N + + offsets_sn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + masks_sn = offsets_sn < N + + offsets_ak = iter_begin * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + offsets_a = K * offsets_am[:, None] + offsets_ak[None, :] + + offsets_bk = iter_begin * (BLOCK_SIZE_K // 2) + tl.arange(0, BLOCK_SIZE_K // 2) + offsets_b = K2 * offsets_bn[:, None] + offsets_bk[None, :] + zshifts = (offsets_bn[:, None] % 2) * 4 # [N, 1] + zshifts = zshifts.T + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + # for k in range(iter_end - iter_begin): + for k in range(0, loop_k): + masks_ak = offsets_ak < K + masks_bk = offsets_bk < K2 + masks_a = masks_am[:, None] & masks_ak[None, :] + masks_b = masks_bn[:, None] & masks_bk[None, :] + other_bzs = 0.0 + a = tl.load(a_ptrs, mask=masks_a, other=0.) + b = tl.load(b_ptrs, masks_b, other_bzs) #[N, K//2] + b = tl.interleave(b, b) # [N, K] + + # Dequantize b. 
+ offsets_szk = ((BLOCK_SIZE_K * k) // GROUP_SIZE + tl.arange(0, NUM_GROUPS)) + masks_szk = offsets_szk < K // GROUP_SIZE + masks_z = masks_szk[:, None] & masks_zn[None, :] + masks_s = masks_szk[:, None] & masks_sn[None, :] + + offsets_z = N2 * offsets_szk[:, None] + offsets_zn[None, :] + zeros_ptrs = zeros_ptr + offsets_z + zeros = tl.load(zeros_ptrs, mask=masks_z, other=other_bzs) # [K//G, N] + + offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :] + scales_ptrs = scales_ptr + offsets_s + scales = tl.load(scales_ptrs, mask=masks_s, other=other_bzs) # [K//G, N] + + if NUM_GROUPS == 1: + # Original efficient implementation for single group + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + else: + zeros = tl.broadcast_to(zeros[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + scales = tl.broadcast_to(scales[:, None, :], (NUM_GROUPS, GROUP_SIZE, BLOCK_SIZE_N)) + ## Reshape back to (BLOCK_SIZE_K, N) + zeros = tl.reshape(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + scales = tl.reshape(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + + b = (b >> bshifts) & 0xF + b = b.T + zeros = (zeros >> zshifts) & 0xF + b = (b - zeros) * scales + b = b.to(a_ptr.type.element_ty) + + # Accumulate results. 
+ accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32) + + offsets_ak += BLOCK_SIZE_K + offsets_bk += BLOCK_SIZE_K // 2 + a_ptrs += BLOCK_SIZE_K + b_ptrs += BLOCK_SIZE_K // 2 + + # c = accumulator.to(c_ptr.type.element_ty) + c = accumulator.to(tl.float32) + + offs_cm = tile_idx_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = tile_idx_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # compiler hints + offs_cm = tl.max_contiguous(tl.multiple_of(offs_cm, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_cn = tl.max_contiguous(tl.multiple_of(offs_cn, BLOCK_SIZE_N), BLOCK_SIZE_N) + offs_c = M * N * k_idx + N * offs_cm[:, None] + offs_cn[None, :] + # c_ptrs = c_ptr + offs_c + scatter_buf_ptr = tl.load(scatter_bufs_ptr + cur_rank).to(tl.pointer_type(tl.float32)) + c_ptrs = scatter_buf_ptr + offs_c + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + + tl.store(c_ptrs, c, mask=c_mask) + + # Signal to other ranks that we have completed this tile + for i in range(1, world_size): + remote = (cur_rank + i) % world_size + remote_base_ptr = tl.load(barrier_bufs_ptr + remote).to(tl.pointer_type(tl.int32)) + # remote_base_ptr = libshmem_device.remote_ptr(barrier_bufs_ptr, remote).to(tl.pointer_type(tl.int32)) + tl.atomic_add(remote_base_ptr + start_pid, 1, scope="sys", sem="release") + + # consumer + local_base_ptr = tl.load(barrier_bufs_ptr + cur_rank).to(tl.pointer_type(tl.int32)) + while tl.atomic_cas(local_base_ptr + start_pid, world_size - 1, 0, scope="sys", sem="acquire") != (world_size - 1): + pass + + # acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + acc = accumulator + for i in range(1, world_size): + rank_id = (cur_rank + i) % world_size + scatter_buf_ptr = tl.load(scatter_bufs_ptr + rank_id).to(tl.pointer_type(tl.float32)) + # scatter_buf_ptr = libshmem_device.remote_ptr(scatter_bufs_ptr, rank_id).to(tl.pointer_type(tl.float32)) + acc += tl.load(scatter_buf_ptr + offs_c, mask=c_mask) + acc_f16 = acc.to(tl.float16) + tl.store(c_ptr + offs_c, 
# Fused-Sequential impl of gemm-scatter
# Each rank produces the partial gemm output and pushes it to the peer rank
@triton.jit
def awq_gemm_scatter_kernel(
        a_ptr, b_ptr, scatter_ptr,
        zeros_ptr,
        scales_ptr,
        M, N, N2, K, K2,
        cur_rank,
        world_size,
        GROUP_SIZE: tl.constexpr,
        NUM_CUS: tl.constexpr,
        BLOCK_SIZE_M: tl.constexpr,
        BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
        NUM_GROUPS: tl.constexpr,
        DP_TILES: tl.constexpr, DANGLING_TILES: tl.constexpr,
        SPLITK: tl.constexpr, SCHEDULER: tl.constexpr,
        USE_REDUCE_KERNEL: tl.constexpr):
    """Dispatch the AWQ gemm-scatter kernel according to ``SCHEDULER``.

    ``SCHEDULER == 0`` forwards every argument to
    ``awq_gemm_scatter_kernel_splitk``. Any other scheduler is rejected at
    compile time: the stream-k variant is not implemented for the scatter
    path. ``DP_TILES`` and ``DANGLING_TILES`` are accepted for signature
    compatibility but are not used by the split-k path.
    """
    if SCHEDULER == 0:
        return awq_gemm_scatter_kernel_splitk(
            a_ptr, b_ptr, scatter_ptr,
            zeros_ptr, scales_ptr,
            M, N, N2, K, K2,
            cur_rank,
            world_size,
            GROUP_SIZE,
            NUM_CUS,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N, BLOCK_SIZE_K,
            SPLITK, NUM_GROUPS, USE_REDUCE_KERNEL)
    else:
        # TODO: stream-k is not supported yet. The previous code fell through
        # to a call that referenced an undefined `c_ptr`; fail with an
        # explicit compile-time message instead.
        tl.static_assert(
            False,
            "awq_gemm_scatter_kernel: only SCHEDULER == 0 (dp/split-k) is supported")
@functools.lru_cache
def get_w4a16_awq_gemm_config_filepath(N: int, K: int, GROUP_SIZE: int, **kwargs) -> str:
    """Build the path of the tuned W4A16 AWQ GEMM config file.

    The device name reported by ``arch_info.get_device()`` is normalised
    first: names starting with "bw" (case-insensitive) become ``"BW200"``,
    names containing "k100" become ``"K100_AI"``, anything else is used
    verbatim. Extra keyword arguments are accepted and ignored (they only
    take part in the cache key).

    Returns:
        The path under ``AITER_TRITON_CONFIGS_PATH/gemm/awq_w4a16``; the file
        is not checked for existence here.
    """
    reported = arch_info.get_device()
    lowered = reported.lower()
    if lowered.startswith("bw"):
        device_name = "BW200"
    elif "k100" in lowered:
        device_name = "K100_AI"
    else:
        device_name = reported

    return os.path.join(
        f"{AITER_TRITON_CONFIGS_PATH}",
        "gemm/awq_w4a16",
        f"awq_gemm_N={N},K={K},device_name={device_name},dtype=w4a16,group_size={GROUP_SIZE}.json",
    )
# The inference function
# input   - [m, k]
# qweight - [n, k // 2]
# qzeros  - [k//g, n//2]
# scales  - [k//g, n]
def gemm_a16w4(input: torch.tensor,
               qweight: torch.tensor,
               scales: torch.tensor,
               qzeros: torch.tensor,
               not_used_placeholder: int = 0,
               configs: Optional[Dict] = None) -> torch.tensor:
    """Run the W4A16 AWQ GEMM for ``input`` against packed 4-bit weights.

    Args:
        input: activation matrix; its shape is unpacked as ``(M, K)``.
        qweight: packed weights; ``qweight.shape[0]`` is taken as ``N``.
        scales: per-group scales, forwarded to the kernel launcher.
        qzeros: per-group zero points; ``K // qzeros.shape[0]`` is used as
            the quantization group size.
        not_used_placeholder: ignored.
        configs: optional mapping from tuned batch size to kernel config.
            When ``None`` the tuned table for ``(N, K, group_size)`` is
            looked up; if none exists a built-in default config is used.

    Returns:
        Whatever ``awq_gemm_triton_impl`` returns for the selected config.
    """
    M, K = input.shape
    N = qweight.shape[0]  # (N, K//2)
    group_size = K // qzeros.shape[0]

    if configs is None:
        configs = get_w4a16_awq_gemm_configs(N, K, group_size)

    if configs:
        # Pick the entry tuned for the batch size closest to M.
        selected = configs[min(configs.keys(), key=lambda x: abs(x - M))]
    else:
        selected = {
            "BLOCK_SIZE_M": 16,
            "BLOCK_SIZE_N": 128,
            "BLOCK_SIZE_K": 32,
            "SCHEDULER": 0,
            "SPLITK": 1,
            "D_SHAPE": (M, N),
            "D_DTYPE": 16,
            "DP_TILES": 0,
            "DANGLING_TILES": 0,
            "NUM_CUS": 0,
            "NUM_CUS_STREAMK": 0,
            "NUM_GROUPS": (32 + group_size - 1) // group_size,
            "USE_REDUCE_KERNEL": False
        }

    # Work on a private copy: `selected` may be the lru_cache'd table entry
    # or a dict owned by the caller, and D_SHAPE is rewritten below.
    config = dict(selected)

    # The tuned entry was recorded for a (possibly different) batch size, so
    # patch the M dimension of the output shape to the actual one.
    d_shape = list(config["D_SHAPE"])
    d_shape[-2] = M
    config["D_SHAPE"] = d_shape

    # NOTE: the fused split-k + reduce path (awq_gemm_triton_fused_impl with
    # awq_gemm_kernel_fused) is intentionally not dispatched from here.
    return awq_gemm_triton_impl(input, qweight, scales, qzeros, config, config.copy(), awq_gemm_kernel)
def awq_gemm_triton_impl(input: torch.tensor,
                         qweight: torch.tensor,
                         scales: torch.tensor,
                         qzeros: torch.tensor,
                         config: Dict,
                         cfg4kernel: Dict,
                         func) -> torch.tensor:
    """Validate shapes, launch the AWQ GEMM kernel ``func`` and finalize the output.

    Args:
        input: activations, unpacked as ``(M, K)``.
        qweight: contiguous packed weights of shape ``(N, K // 2)``.
        scales: scales of shape ``(K // group_size, N)``.
        qzeros: zero points of shape ``(K // group_size, N // 2)``.
        config: launch config; ``NUM_CUS``, ``D_SHAPE``, ``D_DTYPE`` and (for
            the stream-k grid) ``NUM_CUS_STREAMK`` are read from it.
        cfg4kernel: keyword arguments for the kernel. It is modified in
            place: ``D_SHAPE``, ``D_DTYPE`` and ``NUM_CUS_STREAMK`` are removed.
        func: the Triton kernel to launch.

    Returns:
        A float16 result. A 3-D intermediate buffer is reduced over its
        leading dimension into an ``(M, N)`` tensor. Returns ``None`` when the
        ``TRITON_COMPILE_ONLY`` environment variable is ``1``.
    """
    M, K = input.shape
    N = qweight.shape[0]  # (N, K//2)
    assert qweight.is_contiguous()
    group_size = qweight.shape[1] * 2 // qzeros.shape[0]

    assert N > 0 and K > 0 and M > 0
    assert qweight.shape[1] == K // 2
    assert qweight.shape[0] == N
    assert qzeros.shape[0] == K // group_size
    assert qzeros.shape[1] == N // 2
    assert scales.shape[0] == K // group_size
    assert scales.shape[1] == N
    assert group_size <= K
    assert group_size == K or group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES

    num_cus = config["NUM_CUS"]  # unused, but keeps the KeyError for configs lacking it
    out_shape = config["D_SHAPE"]
    out_dtype = torch.float16 if config["D_DTYPE"] == 16 else torch.float32

    def grid(META):
        n_tiles = triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"])
        if META["SCHEDULER"] != 0:
            # streamk
            return (META["DP_TILES"] + config["NUM_CUS_STREAMK"],)
        # dp or splitk
        return (n_tiles * META["SPLITK"],)

    result = torch.zeros(out_shape, dtype=out_dtype, device=input.device)

    # These keys describe the host-side buffer and are not kernel arguments.
    for host_only_key in ("D_SHAPE", "D_DTYPE", "NUM_CUS_STREAMK"):
        cfg4kernel.pop(host_only_key, None)

    compile_only = int(os.getenv("TRITON_COMPILE_ONLY", 0)) == 1
    launch = func[grid]
    if compile_only:
        launch = partial(func.warmup, grid=grid)

    launch(input,
           qweight,
           result,
           qzeros,
           scales,
           M,
           N,
           N//2,
           K,
           K//2,
           group_size,
           **cfg4kernel)

    if compile_only:
        return

    if result.ndim != 3:
        return result.to(torch.float16)

    # Partial results are stacked along dim 0; sum them and convert to fp16.
    final_result = torch.zeros((M, N), dtype=torch.float16, device=input.device)
    awq_reduce_and_convert_triton(result, final_result, M, N, result.shape[0])
    return final_result
@triton.jit
def barrier_all_ipc(rank, num_ranks, comm_buf_ptr):
    """Spin-wait barrier across ``num_ranks`` ranks using int32 flag slots.

    Phase 1 raises this rank's flag (slot ``rank``) in the buffer obtained
    for every rank ``i``; phase 2 waits until all ``num_ranks`` slots of the
    local ``comm_buf_ptr`` have been raised, clearing each one as it is seen.

    NOTE(review): presumably ``libshmem_device.remote_ptr`` maps
    ``comm_buf_ptr`` to rank ``i``'s copy of the same buffer — confirm against
    the shmem library.
    """
    tid = libdevice.thread_idx(axis=0)  # noqa: F841
    for i in range(num_ranks):
        remote_base_ptr = libshmem_device.remote_ptr(comm_buf_ptr, i).to(tl.pointer_type(tl.int32))
        # CAS 0 -> 1 on slot `rank`; retry until the previous value was 0,
        # i.e. until the flag left over from an earlier barrier was consumed.
        while tl.atomic_cas(remote_base_ptr + rank, 0, 1, scope="sys", sem="release") != 0:
            pass

    for i in range(num_ranks):
        # CAS 1 -> 0 on local slot `i`; retry until the flag was observed set,
        # which also resets it for the next barrier.
        while tl.atomic_cas(comm_buf_ptr + i, 1, 0, scope="sys", sem="acquire") != 1:
            pass

    tl.debug_barrier()
def awq_gemm_allreduce_triton_oneshot_impl(
        shm_ctx: np.intp,
        input: torch.tensor,
        qweight: torch.tensor,
        scales: torch.tensor,
        qzeros: torch.tensor,
        barrier_buf: torch.tensor,
        scatter_buf: torch.tensor,
        cur_rank,
        world_size,
        cfg4kernel: Dict) -> torch.tensor:
    """Launch the one-shot AWQ GEMM + all-reduce kernel.

    Args:
        shm_ctx: shmem context handle, forwarded to the kernel unchanged.
        input: activations, unpacked as ``(M, K)``.
        qweight: contiguous packed weights of shape ``(N, K // 2)``.
        scales: scales of shape ``(K // group_size, N)``.
        qzeros: zero points of shape ``(K // group_size, N // 2)``.
        barrier_buf: barrier buffer(s), forwarded to the kernel.
        scatter_buf: scatter buffer(s), forwarded to the kernel.
        cur_rank: rank of this process in the group.
        world_size: number of ranks in the group.
        cfg4kernel: kernel config. It is NOT modified; a private copy is
            stripped of host-only keys and given ``GROUP_SIZE_M = 1``.

    Returns:
        A tuple ``(output, compiled)``: the ``(M, N)`` float16 output tensor
        and the object returned by the kernel launch. (The return annotation
        predates the tuple and is kept for interface stability.)
    """
    M, K = input.shape
    N = qweight.shape[0]  # (N, K//2)
    assert(qweight.is_contiguous())
    group_size = qweight.shape[1] * 2 // qzeros.shape[0]

    assert N > 0 and K > 0 and M > 0
    assert qweight.shape[1] == K // 2 and qweight.shape[0] == N
    assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 2
    assert scales.shape[0] == K // group_size and scales.shape[1] == N
    assert group_size <= K
    assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K

    # Copy before mutating: the caller passes the dict straight out of the
    # lru_cache'd config table, and popping D_SHAPE from that shared entry
    # would break later users of the same (N, K, group_size) config.
    kernel_cfg = dict(cfg4kernel)
    kernel_cfg.pop("D_SHAPE", None)
    kernel_cfg.pop("D_DTYPE", None)
    kernel_cfg.pop("NUM_CUS_STREAMK", None)
    kernel_cfg["GROUP_SIZE_M"] = 1

    output = torch.empty((M, N), dtype=torch.float16, device=input.device)

    # One program per output tile.
    grid = lambda META: (
        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)
    compiled = awq_gemm_allreduce_oneshot_kernel[grid](
        shm_ctx,
        input,
        qweight,
        output,
        scatter_buf,
        barrier_buf,
        qzeros,
        scales,
        M,
        N,
        N//2,
        K,
        K//2,
        cur_rank, world_size,
        group_size,
        **kernel_cfg)
    return output, compiled
qweight.shape[0] # (N, K//2) + assert(qweight.is_contiguous()) + group_size = qweight.shape[1] * 2 // qzeros.shape[0] + + assert N > 0 and K > 0 and M > 0 + assert qweight.shape[1] == K // 2 and qweight.shape[0] == N + assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 2 + assert scales.shape[0] == K // group_size and scales.shape[1] == N + assert group_size <= K + assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K + + num_cus = config["NUM_CUS"] + d_shape = config["D_SHAPE"] + d_dtype = config["D_DTYPE"] + d_dtype = torch.float16 if d_dtype == 16 else torch.float32 + + def grid(META): + # tiles_M = (M + META["BLOCK_SIZE_M"] - 1) // META["BLOCK_SIZE_M"] + # tiles_N = (N + META["BLOCK_SIZE_N"] - 1) // META["BLOCK_SIZE_N"] + tiles_M = triton.cdiv(M, META["BLOCK_SIZE_M"]) + tiles_N = triton.cdiv(N, META["BLOCK_SIZE_N"]) + total_tiles = tiles_M * tiles_N + if META["SCHEDULER"] == 0: + # dp or splitk + # add extra total_tiles for reduction + return (total_tiles * META["SPLITK"] + total_tiles,) + else: + # TODO: not supported yet + assert False + # streamk + return (META["DP_TILES"] + config["NUM_CUS_STREAMK"],) + + cfg4kernel.pop("D_SHAPE", None) + cfg4kernel.pop("D_DTYPE", None) + cfg4kernel.pop("NUM_CUS_STREAMK", None) + + fn = func[grid] + + if int(os.getenv("TRITON_COMPILE_ONLY", 0)) == 1: + fn = partial(func.warmup, grid=grid) + + # result = torch.zeros(d_shape, dtype=d_dtype, device=input.device) + final_result = torch.zeros((M, N), dtype=d_dtype, device=input.device) + + # with torch.cuda.stream(current_stream): + fn(input, + qweight, + scatter_buf, #result, + qzeros, + scales, + final_result, + barrier_buf, + M, + N, + N//2, + K, + K//2, + cur_rank, + world_size, + group_size, + **cfg4kernel) + + # barrier_all_ipc[(1, )](cur_rank, world_size, sync_buf) + # pynvshmem.nvshmem_barrier_all() + + # if int(os.getenv("TRITON_COMPILE_ONLY", 0)) == 1: + # return + + # barrier_all_on_stream(cur_rank, world_size, sync_buf, 
def update_config(M, K, N, G, cfg):
    """Return a copy of ``cfg`` extended with the derived launch parameters.

    The derived values are computed once on the host, both because the
    launcher needs them and to avoid recomputing them inside the kernel.

    Args:
        M, K, N: GEMM problem sizes.
        G: quantization group size.
        cfg: base config; reads ``BLOCK_SIZE_M/N/K``, ``SCHEDULER`` and,
            depending on the branch, ``SPLITK``, ``NUM_CUS`` and
            ``USE_REDUCE_KERNEL``. It is not modified.

    Returns:
        A new dict with ``NUM_GROUPS``, ``DP_TILES``, ``DANGLING_TILES``,
        ``D_SHAPE`` and ``D_DTYPE`` set (plus ``NUM_CUS_STREAMK`` when the
        stream-k schedule is kept).
    """
    def _cdiv(a, b):
        return (a + b - 1) // b

    out = cfg.copy()
    out["NUM_GROUPS"] = _cdiv(out["BLOCK_SIZE_K"], G)

    static_schedule = out["SCHEDULER"] == 0

    # Plain data-parallel: fp16 output written directly.
    if static_schedule and out["SPLITK"] == 1:
        out.update(DP_TILES=0, DANGLING_TILES=0, D_SHAPE=(M, N), D_DTYPE=16)
        return out

    # Split-k: fp32 partials, stacked per split when a reduce kernel is used.
    if static_schedule and out["SPLITK"] > 1:
        out.update(DP_TILES=0, DANGLING_TILES=0, D_DTYPE=32)
        out["D_SHAPE"] = (out["SPLITK"], M, N) if out["USE_REDUCE_KERNEL"] else (M, N)
        return out

    # Stream-k: tiles beyond the first NUM_CUS that do not fill a whole wave
    # are the "dangling" ones.
    total_tiles = _cdiv(M, out["BLOCK_SIZE_M"]) * _cdiv(N, out["BLOCK_SIZE_N"])
    num_cus = out["NUM_CUS"]
    dangling = max(0, total_tiles - num_cus) % num_cus

    if dangling == 0:
        # Nothing left over: fall back to the data-parallel schedule.
        out.update(
            SCHEDULER=0,
            SPLITK=1,
            USE_REDUCE_KERNEL=0,
            DP_TILES=0,
            DANGLING_TILES=0,
            D_SHAPE=(M, N),
            D_DTYPE=16,
        )
        return out

    out["DP_TILES"] = total_tiles - dangling
    out["DANGLING_TILES"] = dangling

    iters_per_tile = _cdiv(K, out["BLOCK_SIZE_K"])
    dangling_iters = iters_per_tile * dangling
    iters_per_cu = _cdiv(dangling_iters, num_cus)
    out["NUM_CUS_STREAMK"] = _cdiv(dangling_iters, iters_per_cu)

    partials_per_tile = _cdiv(iters_per_tile, iters_per_cu) + 1
    out["D_DTYPE"] = 32
    out["D_SHAPE"] = (partials_per_tile, M, N) if out["USE_REDUCE_KERNEL"] else (M, N)
    return out
def awq_reduce_and_convert_triton(
    input_tensor: torch.Tensor,
    output_tensor: torch.Tensor,
    M: int,
    N: int,
    batch_size: int = 1
) -> None:
    """Launch ``awq_reduce_and_convert_kernel`` over an ``M`` x ``N`` output.

    Args:
        input_tensor: buffer holding ``batch_size`` partial results.
        output_tensor: destination written in place by the kernel.
        M: number of output rows.
        N: number of output columns.
        batch_size: number of partial results to accumulate.
    """
    tile_m = 32
    tile_n = 128
    warps = 16

    def launch_grid(META):
        # 2-D launch: one program per (tile_m x tile_n) output tile.
        rows = triton.cdiv(M, META['BLOCK_SIZE_M'])
        cols = triton.cdiv(N, META['BLOCK_SIZE_N'])
        return (rows, cols)

    awq_reduce_and_convert_kernel[launch_grid](
        input_tensor,
        output_tensor,
        M,
        N,
        batch_size,
        tile_m,
        tile_n,
        num_warps=warps
    )
int) -> Optional[dict[int, Any]]: + """ + Return optimized configurations for the w8a8 block fp8 kernel. + + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs + # directory + # device_name = current_platform.get_device_name().replace(" ", "_") + device_name = arch_info.get_device() + device_name = "BW200" if device_name.lower().startswith("bw") else device_name + + # new config by arch and cu number + arch = triton.runtime.driver.active.get_current_target().arch + num_cu = get_cu_num() + json_file_name = f"N={N},K={K},arch={arch},cu={num_cu},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + config_file_path = os.path.join( + f"{AITER_TRITON_CONFIGS_PATH}", "gemm/block_w8a8", json_file_name + ) + + # Fallback to device config (to be removed) + if not os.path.exists(config_file_path): + json_file_name = f"N={N},K={K},device_name={device_name},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + + config_file_path = os.path.join( + f"{AITER_TRITON_CONFIGS_PATH}", "gemm/block_w8a8", json_file_name + ) + + if os.path.exists(config_file_path): + with open(config_file_path) as f: + #logger.info( + # "Using configuration from %s for W8A8 Block INT8 kernel.", + # config_file_path, + #) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default + # configuration + logger.warning( + ("Using default W8A8 Block INT8 kernel config. Performance might " + "be sub-optimal! 
# for intra-node case
@triton.heuristics(
    values={
        "DIVISIBLE_M": lambda args: args["M"] % args["BLOCK_SIZE_M"] == 0,
        "DIVISIBLE_N": lambda args: args["N"] % args["BLOCK_SIZE_N"] == 0,
    }
)
@triton.jit
def _w8a8_block_matmul_allreduce_oneshot_kernel(
    # Pointers to inputs and output
    A,
    B,
    C,
    As,
    Bs,
    # Shape for matmul
    M,
    N,
    K,
    # Block size for block-wise quantization
    group_n,
    group_k,
    # Stride for inputs and output
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_As_m,
    stride_As_k,
    stride_Bs_k,
    stride_Bs_n,
    # shmem args
    shm_ctx,
    barrier_bufs_ptr,
    scatter_bufs_ptr,
    barrier_buf_ptr,
    scatter_buf_ptr,
    cur_rank: tl.constexpr,
    local_world_size: tl.constexpr,
    world_size: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    DIVISIBLE_M: tl.constexpr,
    DIVISIBLE_N: tl.constexpr,
    COMBINE_SCALE_LOAD: tl.constexpr = False
):
    """Triton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.

    Each program computes one output tile, publishes it through the local
    scatter buffer, signals the peer ranks of its node, waits for their
    signals, then sums the peers' tiles into its own accumulator and writes
    the reduced tile to `C`.

    `barrier_bufs_ptr` / `scatter_bufs_ptr` are tables indexed by rank whose
    entries are loaded and reinterpreted as pointers; `barrier_buf_ptr` /
    `scatter_buf_ptr` are this rank's own buffers. `shm_ctx` is currently
    unused (the rocSHMEM context call is disabled).
    """

    # set shmem device ctx (rocSHMEM required when call libshmem.func)
    # libshmem_device.set_rocshmem_ctx(shm_ctx)

    node_id = cur_rank // local_world_size

    # Grouped tile ordering: programs are assigned to tiles in bands of
    # GROUP_SIZE_M rows.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Program and tile indices start at 0, so only non-negativity may be
    # assumed; a strict `> 0` is false for the first program/tile.
    tl.assume(pid >= 0)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
    tl.assume(group_n > 0)
    tl.assume(group_k > 0)
    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)
    tl.assume(stride_As_m > 0)
    tl.assume(stride_As_k > 0)
    tl.assume(stride_Bs_k > 0)
    tl.assume(stride_Bs_n > 0)

    # Row/column offsets wrap modulo M / N so the loads stay in bounds; the
    # store below uses unwrapped offsets plus a mask.
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    if COMBINE_SCALE_LOAD:
        # Two K-steps per iteration; the scales of both steps are fetched
        # with a single 2-wide load and split afterwards.
        As_ptrs = As + offs_am[:, None] * stride_As_m
        offs_bsn = offs_bn // group_n
        Bs_ptrs = Bs + offs_bsn[:, None] * stride_Bs_n
        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2):
            a0 = tl.load(a_ptrs,
                         mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                         other=0.0)
            b0 = tl.load(b_ptrs,
                         mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
                         other=0.0)

            k_start = k * BLOCK_SIZE_K
            offs_ks = k_start // group_k + tl.arange(0, 2)
            a_s = tl.load(As_ptrs + offs_ks[None, :] * stride_As_k,
                          mask=offs_ks[None, :] <= (K - 1) // group_k,
                          other=0.0)
            b_s = tl.load(Bs_ptrs + offs_ks[None, :] * stride_Bs_k,
                          mask=offs_ks[None, :] <= (K - 1) // group_k,
                          other=0.0)
            a_s0, a_s1 = tl.split(a_s)
            b_s0, b_s1 = tl.split(b_s)

            accumulator += tl.dot(a0, b0).to(tl.float32) * a_s0[:, None] * b_s0[None, :]

            a0 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak,
                         mask=offs_k[None, :] < K - (k + 1) * BLOCK_SIZE_K,
                         other=0.0)
            b0 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk,
                         mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K,
                         other=0.0)

            accumulator += tl.dot(a0, b0).to(tl.float32) * a_s1[:, None] * b_s1[None, :]

            a_ptrs += BLOCK_SIZE_K * stride_ak * 2
            b_ptrs += BLOCK_SIZE_K * stride_bk * 2
    else:
        As_ptrs = As + offs_am * stride_As_m
        offs_bsn = offs_bn // group_n
        Bs_ptrs = Bs + offs_bsn * stride_Bs_n
        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
            a = tl.load(a_ptrs,
                        mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                        other=0.0)
            b = tl.load(b_ptrs,
                        mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
                        other=0.0)

            k_start = k * BLOCK_SIZE_K
            offs_ks = k_start // group_k
            a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
            b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)

            accumulator += tl.dot(a, b).to(tl.float32) * a_s[:, None] * b_s[None, :]

            a_ptrs += BLOCK_SIZE_K * stride_ak
            b_ptrs += BLOCK_SIZE_K * stride_bk

    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)

    offs_c = stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    # Publish this rank's partial tile in its own scatter buffer.
    c_ptrs = scatter_buf_ptr + offs_c

    STORE_MASK_FREE: tl.constexpr = DIVISIBLE_M & DIVISIBLE_N
    if STORE_MASK_FREE:
        tl.store(c_ptrs, c)
        c_mask = None
    else:
        c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
        tl.store(c_ptrs, c, mask=c_mask)

    # Signal to other ranks that we have completed this tile
    for i in range(1, local_world_size):
        remote = (cur_rank + i) % local_world_size + node_id * local_world_size
        remote_base_ptr = tl.load(barrier_bufs_ptr + remote).to(tl.pointer_type(tl.int32))
        tl.atomic_add(remote_base_ptr + pid, 1, scope="sys", sem="release")

    # consumer: spin until slot `pid` holds world_size - 1, resetting it to 0.
    # NOTE(review): only local_world_size - 1 peers signal above, so this
    # presumably relies on world_size == local_world_size (intra-node) —
    # confirm with the launcher.
    while tl.atomic_cas(barrier_buf_ptr + pid, world_size - 1, 0, scope="sys", sem="acquire") != (world_size - 1):
        pass

    # Start from the local fp32 accumulator; only peer tiles are re-read.
    acc = accumulator

    # intra-node reduce
    for i in range(1, local_world_size):
        rank_id = (cur_rank + i) % local_world_size + node_id * local_world_size
        scatter_buf_ptr = tl.load(scatter_bufs_ptr + rank_id).to(tl.pointer_type(tl.float32))
        acc += tl.load(scatter_buf_ptr + offs_c, mask=c_mask)
    # NOTE(review): the reduced tile always goes through float16 here, even
    # when C is bfloat16/float32 — confirm this is intended.
    acc_f16 = acc.to(tl.float16)

    if STORE_MASK_FREE:
        tl.store(C + offs_c, acc_f16)
    else:
        tl.store(C + offs_c, acc_f16, mask=c_mask, cache_modifier=".wt")
stride_cm, + stride_cn, + stride_As_m, + stride_As_k, + stride_Bs_k, + stride_Bs_n, + # shmem args + shm_ctx, + barrier_bufs_ptr, + scatter_bufs_ptr, + barrier_buf_ptr, + scatter_buf_ptr, + cur_rank: tl.constexpr, + local_world_size: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + DIVISIBLE_M: tl.constexpr, + DIVISIBLE_N: tl.constexpr, + DIVISIBLE_K: tl.constexpr, + COMBINE_SCALE_LOAD: tl.constexpr = False, + USE_MLS_LOAD: tl.constexpr = False +): + """Triton-accelerated function used to perform linear operations (dot + product) on input tensors `A` and `B` with block-wise quantization, and + store the result in output tensor `C`. + """ + + # set shmem device ctx (rocSHMEM required when call libshmem.func) + # libshmem_device.set_rocshmem_ctx(shm_ctx) + # tid = libdevice.thread_idx(0) # noqa + + # node_id = cur_rank // local_world_size + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + tiles_per_rank_n = tl.cdiv(num_pid_n, local_world_size) + N_per_rank = tl.cdiv(N, local_world_size) + rank_offset = pid_n // tiles_per_rank_n + + tl.assume(pid > 0) + tl.assume(pid_m > 0) + tl.assume(pid_n > 0) + tl.assume(group_n > 0) + tl.assume(group_k > 0) + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_As_m > 0) + tl.assume(stride_As_k > 0) + tl.assume(stride_Bs_k > 0) + tl.assume(stride_Bs_n > 0) + + if group_k > 0: + tl.static_assert(BLOCK_SIZE_K <= group_k and group_k % BLOCK_SIZE_K == 0, + "BLOCK_SIZE_K 
must be divisible by GROUP_SIZE_K") + if COMBINE_SCALE_LOAD: # used for use_int8_w8a8 + tl.static_assert(stride_As_k == 1, + "COMBINE_SCALE_LOAD implictly stride_As_k == 1!") + tl.static_assert(DIVISIBLE_K == True and BLOCK_SIZE_K == group_k, + "COMBINE_SCALE_LOAD only add and verify on block_k_diviable!") + if USE_MLS_LOAD: + tl.static_assert(DIVISIBLE_K == True and DIVISIBLE_N == True, + "USE_MLS_LOAD must require block_k_diviable and block_n_diviable!") + + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + mls_offs_k = 0 + if COMBINE_SCALE_LOAD: + As_ptrs = As + offs_am[:, None] * stride_As_m + offs_bsn = offs_bn // group_n + Bs_ptrs = Bs + offs_bsn[:, None] * stride_Bs_n + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2): + a0 = tl.load(a_ptrs, + mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, + other=0.0) + if not USE_MLS_LOAD: + b0 = tl.load(b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0) + else: + b0 = tl.matrix_load( + B, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N]) + + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + tl.arange(0, 2) + a_s = tl.load(As_ptrs + offs_ks[None, :] * stride_As_k, + mask=offs_ks[None, :] <= (K - 1) // group_k, + other=0.0) + b_s = tl.load(Bs_ptrs + offs_ks[None, :] * stride_Bs_k, + mask=offs_ks[None, :] <= (K - 1) // group_k, + other=0.0) + a_s0, a_s1 = tl.split(a_s) + b_s0, b_s1 = tl.split(b_s) + + accumulator += tl.dot(a0, b0).to(tl.float32) * a_s0[:, None] * b_s0[None, :] + + a0 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, + mask=offs_k[None, :] < K - (k + 
1)* BLOCK_SIZE_K, + other=0.0) + if not USE_MLS_LOAD: + b0 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk, + mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, + other=0.0) + else: + b0 = tl.matrix_load( + B, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k + BLOCK_SIZE_K, (pid_n * BLOCK_SIZE_N) % N]) + + accumulator += tl.dot(a0, b0).to(tl.float32) * a_s1[:, None] * b_s1[None, :] + + a_ptrs += BLOCK_SIZE_K * stride_ak * 2 + b_ptrs += BLOCK_SIZE_K * stride_bk * 2 + mls_offs_k += BLOCK_SIZE_K * 2 + else: + As_ptrs = As + offs_am * stride_As_m + offs_bsn = offs_bn // group_n + Bs_ptrs = Bs + offs_bsn * stride_Bs_n + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, + mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, + other=0.0) + if not USE_MLS_LOAD: + b = tl.load(b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0) + else: + b = tl.matrix_load( + B, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N]) + + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + a_s = tl.load(As_ptrs + offs_ks * stride_As_k) + b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k) + + accumulator += tl.dot(a, b).to(tl.float32) * a_s[:, None] * b_s[None, :] + + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + mls_offs_k += BLOCK_SIZE_K + + + if C.dtype.element_ty == tl.bfloat16: + c = accumulator.to(tl.bfloat16) + elif C.dtype.element_ty == tl.float16: + c = accumulator.to(tl.float16) + else: + c = accumulator.to(tl.float32) + + # map columns into the slice owned by cur_rank + local_pid_n = pid_n % tiles_per_rank_n + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = (cur_rank * N_per_rank + local_pid_n * BLOCK_SIZE_N) + tl.arange(0, BLOCK_SIZE_N) + offs_c = stride_cm * offs_cm[:, None] + 
stride_cn * offs_cn[None, :] + + scatter_ptr = tl.load(scatter_bufs_ptr + rank_offset).to(tl.pointer_type(tl.float32)) + c_ptrs = scatter_ptr + offs_c + + # # for debug + # offs_cm1 = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + # offs_cn1 = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + # offs_c1 = stride_cm * offs_cm1[:, None] + stride_cn * offs_cn1[None, :] + # c_ptrs1 = C + offs_c1 + # c1 = tl.load(c_ptrs1) + + STORE_MASK_FREE: tl.constexpr = DIVISIBLE_M & DIVISIBLE_N + if STORE_MASK_FREE: + tl.store(c_ptrs, c) + c_mask = None + else: + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + ''' + # Signal to other ranks that we have completed this tile + for i in range(1, local_world_size): + remote = (cur_rank + i) % local_world_size + node_id * local_world_size + remote_base_ptr = tl.load(barrier_bufs_ptr + remote).to(tl.pointer_type(tl.int32)) + # remote_base_ptr = libshmem_device.remote_ptr(barrier_bufs_ptr, remote).to(tl.pointer_type(tl.int32)) + tl.atomic_add(remote_base_ptr + pid, 1, scope="sys", sem="release") + + # consumer + # local_base_ptr = tl.load(barrier_bufs_ptr + cur_rank).to(tl.pointer_type(tl.int32)) + while tl.atomic_cas(barrier_buf_ptr + pid, world_size - 1, 0, scope="sys", sem="acquire") != (world_size - 1): + pass + + # acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + acc = accumulator + + # intra-node reduce + for i in range(1, local_world_size): + rank_id = (cur_rank + i) % local_world_size + node_id * local_world_size + scatter_buf_ptr = tl.load(scatter_bufs_ptr + rank_id).to(tl.pointer_type(tl.float32)) + # scatter_buf_ptr = libshmem_device.remote_ptr(scatter_bufs_ptr, rank_id).to(tl.pointer_type(tl.float32)) + acc += tl.load(scatter_buf_ptr + offs_c, mask=c_mask) + acc_f16 = acc.to(tl.float16) + + if STORE_MASK_FREE: + tl.store(C + offs_c, acc_f16) + else: + tl.store(C + offs_c, acc_f16, mask=c_mask, cache_modifier=".wt") + ''' + +def gemm_allreduce_w8a8( + A: 
torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, + barriers_buf: torch.tensor, + scatters_buf: torch.tensor, + barrier_buf: torch.tensor, + scatter_buf: torch.tensor, + max_tiles: int, + tp_group: dist.ProcessGroup, + local_world_size: int, + shmem_ctx: Optional[np.intp] = None, + configs: Optional[Dict] = None +) -> torch.Tensor: + """This function performs matrix multiplication with block-wise + quantization. + + It takes two input tensors `A` and `B` with scales `As` and `Bs`. + The output is returned in the specified `output_dtype`. + + Args: + A: The input tensor, e.g., activation. + B: The input tensor, e.g., weight. + As: The per-token-group quantization scale for `A`. + Bs: The per-block quantization scale for `B`. + block_size: The block size for per-block quantization. It should be + 2-dim, e.g., [128, 128]. + output_dytpe: The dtype of the returned tensor. + + Returns: + torch.Tensor: The result of matmul. 
+ """ + + # get from tp_group + cur_rank = tp_group.rank() + world_size = tp_group.size() + + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + C_shape = A.shape[:-1] + (N, ) + C = A.new_empty(C_shape, dtype=output_dtype) + + if configs is None: + configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1]) + if configs: + # If an optimal configuration map has been found, look up the + # optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Default config + # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 2, + "COMBINE_SCALE_LOAD": False, + "num_warps": 4, + "num_stages": 1, + } + + # check + cur_total_tiles = \ + triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(N, config["BLOCK_SIZE_N"]) + + assert cur_total_tiles <= max_tiles, ( + f"{cur_total_tiles=} should not be bypass {max_tiles=}, " + "please check the min BLOCK_SIZE_M & BLOCK_SIZE_N in awq_gemm config and pass them to register_shmem") + + def grid(META): + return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * + triton.cdiv(N, META["BLOCK_SIZE_N"]), ) + + _kernel = _w8a8_block_matmul_allreduce_oneshot_kernel[grid]( + A, + B, + C, + As, + Bs, + M, + N, + K, + block_n, + block_k, + A.stride(-2), + A.stride(-1), + B.stride(1), + B.stride(0), + C.stride(-2), + C.stride(-1), + As.stride(-2), + As.stride(-1), + Bs.stride(1), + Bs.stride(0), + # shmem args + shmem_ctx, + barriers_buf, + scatters_buf, + barrier_buf, + scatter_buf, + cur_rank, + 
local_world_size, + world_size, + **config, + ) + + # # Create kernel path metadata, which mapping autotune's key to kernel path. + # # input_dtype = str(input.dtype).split('.')[-1] + # block_shape_n = B.shape[0] // Bs.shape[0] + # block_shape_k = B.shape[1] // Bs.shape[1] + # path_key = str((M, K, N, block_shape_n, block_shape_k, config['BLOCK_SIZE_M'], config['BLOCK_SIZE_N'], config['BLOCK_SIZE_K'], config['GROUP_SIZE_M'], config['COMBINE_SCALE_LOAD'])) + # arch = triton.runtime.driver.active.get_current_target().arch + # save_kernel_path(f"{_kernel.name}-{arch}-cu{get_cu_num()}-blockwise-w8a8.json", + # path_key, os.path.basename(_kernel.perf_ir_path)) + + return C + +# support internode with 3 stages +# intra-scatter + inter-allreduce + intra-allgather +def gemm_allreduce_w8a8_v2( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, + barriers_buf: torch.tensor, + scatters_buf: torch.tensor, + reduces_buf: torch.tensor, + barrier_buf: torch.tensor, + scatter_buf: torch.tensor, + reduce_buf: torch.tensor, + max_seq_len: int, + max_tiles: int, + tp: GroupCoordinator, + pp: GroupCoordinator, + shmem_ctx: Optional[np.intp] = None, + configs: Optional[Dict] = None +) -> torch.Tensor: + """This function performs matrix multiplication with block-wise + quantization. + + It takes two input tensors `A` and `B` with scales `As` and `Bs`. + The output is returned in the specified `output_dtype`. + + Args: + A: The input tensor, e.g., activation. + B: The input tensor, e.g., weight. + As: The per-token-group quantization scale for `A`. + Bs: The per-block quantization scale for `B`. + block_size: The block size for per-block quantization. It should be + 2-dim, e.g., [128, 128]. + output_dytpe: The dtype of the returned tensor. + + Returns: + torch.Tensor: The result of matmul. 
+ """ + + # get from tp_group + world_size = tp.world_size + cur_rank = tp.rank_in_group + # nnodes = world_size // local_world_size + + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + # C_shape = A.shape[:-1] + (N, ) + # C = A.new_empty(C_shape, dtype=output_dtype) + # # for debug + # C = A.new_zeros(C_shape, dtype=output_dtype) + # rows = C.shape[0] + # cols = C.shape[1] + # increment = torch.arange(rows * cols).reshape(rows, cols).to(C.device) + # C = C + increment + + if configs is None: + configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1]) + if configs: + # If an optimal configuration map has been found, look up the + # optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Default config + # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 2, + "COMBINE_SCALE_LOAD": False, + "USE_MLS_LOAD": False, + "num_warps": 4, + "num_stages": 1, + } + + # check + cur_total_tiles = \ + triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(N, config["BLOCK_SIZE_N"]) + + assert cur_total_tiles <= max_tiles, ( + f"{cur_total_tiles=} should not be bypass {max_tiles=}, " + "please check the min BLOCK_SIZE_M & BLOCK_SIZE_N in awq_gemm config and pass them to register_shmem") + + # test divisible case first + assert triton.cdiv(N, config["BLOCK_SIZE_N"]) >= world_size, ( + f'{N=} is too small compared to {config["BLOCK_SIZE_N"]=}' + 'please make sure num_pid_n at least >= local_world_size for 
efficiency' + ) + + # Stage 1: intra node gemm + scatter + # A * B -> scatter_buf + def grid(META): + return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * + triton.cdiv(N, META["BLOCK_SIZE_N"]), ) + _kernel_gemm = _w8a8_block_matmul_scatter_kernel[grid]( + A, + B, + reduce_buf, #C, + As, + Bs, + M, + N, + K, + block_n, + block_k, + A.stride(-2), + A.stride(-1), + B.stride(1), + B.stride(0), + reduce_buf.stride(-2), + reduce_buf.stride(-1), + As.stride(-2), + As.stride(-1), + Bs.stride(1), + Bs.stride(0), + # shmem args + shmem_ctx, + barriers_buf, + scatters_buf, + barrier_buf, + scatter_buf, + cur_rank, + world_size, + **config, + ) + + _kernel_barrier = barrier_all_ipc[(1, )](cur_rank, world_size, barriers_buf) + + N_per_rank = triton.cdiv(N, world_size) + + grid_reduce = lambda META: ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N_per_rank, META["BLOCK_SIZE_N"]), + ) + _kernel_scatter_reduce = scatter_reduce_gather_kernel[grid_reduce]( + reduce_buf, + reduces_buf, + scatter_buf, + M, + N, + N_per_rank, + scatter_buf.stride(-2), + scatter_buf.stride(-1), + reduce_buf.stride(-2), + reduce_buf.stride(-1), + cur_rank=cur_rank, + local_world_size=world_size, + BLOCK_SIZE_M=32, + BLOCK_SIZE_N=128, + num_warps=16, + num_stages=2, + ) + + # return C + return reduce_buf[:M] + # return reduce_buf # for debug + + +@triton.jit +def barrier_all_ipc(rank, num_ranks, comm_buf_base_ptrs): + tid = libdevice.thread_idx(axis=0) # noqa: F841 + for i in range(num_ranks): + remote_base_ptr = tl.load(comm_buf_base_ptrs + i).to(tl.pointer_type(tl.int32)) + # remote_base_ptr = comm_buf_base_ptrs[i] + while tl.atomic_cas(remote_base_ptr + rank, 0, 1, scope="sys", sem="release") != 0: + pass + + for i in range(num_ranks): + local_base_ptr = tl.load(comm_buf_base_ptrs + rank).to(tl.pointer_type(tl.int32)) + # local_base_ptr = comm_buf_base_ptrs[rank] + while tl.atomic_cas(local_base_ptr + i, 1, 0, scope="sys", sem="acquire") != 1: + pass + + tl.debug_barrier() + +@triton.jit +def 
scatter_reduce_local_kernel( + out_ptr, + scatter_buf_ptr, + M, + N, + N_per_rank, + stride_scatter_m, + stride_scatter_n, + stride_out_m, + stride_out_n, + cur_rank: tl.constexpr, + local_world_size: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """Reduce all scatter buffers into a local output slice of shape (M, N_per_rank).""" + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N_per_rank, BLOCK_SIZE_N) + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n_local = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + # accum = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + # mask = (offs_m[:, None] < M) & (offs_n_local[None, :] < N) + ptrs = scatter_buf_ptr + offs_m[:, None] * stride_scatter_m + offs_n_local[None, :] * stride_scatter_n + accum = tl.load(ptrs) + # for r in range(0, local_world_size): + for r in range(1, local_world_size): + offs_n_global = offs_n_local + r * N_per_rank + mask = (offs_m[:, None] < M) & (offs_n_global[None, :] < N) + ptrs = scatter_buf_ptr + offs_m[:, None] * stride_scatter_m + offs_n_global[None, :] * stride_scatter_n + accum += tl.load(ptrs, mask=mask) + + # offs_n_local = offs_n_local + (cur_rank % local_world_size) * N_per_rank + out_ptrs = out_ptr + offs_m[:, None] * stride_out_m + offs_n_local[None, :] * stride_out_n + tl.store(out_ptrs, accum, mask=(offs_m[:, None] < M) & (offs_n_local[None, :] < N_per_rank)) + +@triton.heuristics( + values={ + "DIVISIBLE_M": lambda args: args["M"] % args["BLOCK_SIZE_M"] == 0, + "DIVISIBLE_N": lambda args: ( + args["N_per_rank"] % args["BLOCK_SIZE_N"] == 0 + and args["N"] % args["local_world_size"] == 0 + ), + } +) +# @triton.jit +@triton.jit +def scatter_reduce_gather_kernel( + out_ptr, + outs_ptr, + scatter_buf_ptr, + M, + N, + N_per_rank, + stride_scatter_m, + stride_scatter_n, + stride_out_m, + stride_out_n, + cur_rank: 
tl.constexpr, + local_world_size: tl.constexpr, + DIVISIBLE_M: tl.constexpr, + DIVISIBLE_N: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """Reduce all scatter buffers into a local output slice of shape (M, N_per_rank).""" + + # tid = libdevice.thread_idx(0) + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N_per_rank, BLOCK_SIZE_N) + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n_local = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + tl.multiple_of(offs_m, BLOCK_SIZE_M) + tl.multiple_of(offs_n_local, BLOCK_SIZE_N) + tl.max_contiguous(offs_n_local, BLOCK_SIZE_N) + + STORE_MASK_FREE: tl.constexpr = DIVISIBLE_M & DIVISIBLE_N + + # accum = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + ptrs = scatter_buf_ptr + offs_m[:, None] * stride_scatter_m + offs_n_local[None, :] * stride_scatter_n + if STORE_MASK_FREE: + accum = tl.load(ptrs) + else: + mask = (offs_m[:, None] < M) & (offs_n_local[None, :] < N_per_rank) + accum = tl.load(ptrs, mask) + # scatter buffer add locally across N_per_rank + for r in range(1, local_world_size): + offs_n_global = offs_n_local + r * N_per_rank + n_end_rank = (r + 1) * N_per_rank + n_end = tl.minimum(N, n_end_rank) + ptrs = scatter_buf_ptr + offs_m[:, None] * stride_scatter_m + offs_n_global[None, :] * stride_scatter_n + if STORE_MASK_FREE: + accum += tl.load(ptrs) + else: + mask = (offs_m[:, None] < M) & (offs_n_global[None, :] < n_end) + accum += tl.load(ptrs, mask=mask) + + # scatter buffer push reduce results to local/peer rank + offs_n_local = offs_n_local + cur_rank * N_per_rank + n_end_rank = (cur_rank + 1) * N_per_rank + n_end = tl.minimum(N, n_end_rank) + for r in range(local_world_size): + remote_out_ptr = tl.load(outs_ptr + r).to(tl.pointer_type(out_ptr.dtype.element_ty)) + out_ptrs = remote_out_ptr + offs_m[:, None] * stride_out_m + offs_n_local[None, :] * 
stride_out_n + if STORE_MASK_FREE: + tl.store(out_ptrs, accum) + else: + tl.store(out_ptrs, accum, mask=(offs_m[:, None] < M) & (offs_n_local[None, :] < n_end)) #, cache_modifier=".cs") + +@triton.jit +def scatter_gather_local_kernel( + out_ptr, + scatter_buf_ptr, + scatter_bufs_ptr, + max_seq_len: tl.constexpr, + M: tl.constexpr, + N: tl.constexpr, + N_per_rank: tl.constexpr, + stride_scatter_m: tl.constexpr, + stride_scatter_n: tl.constexpr, + stride_out_m: tl.constexpr, + stride_out_n: tl.constexpr, + cur_rank: tl.constexpr, + local_world_size: tl.constexpr, + nnodes: tl.constexpr, + DO_REDUCE: tl.constexpr, # need reduce locally if nnodes > 1 + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """Gather the peer ranks scatter_bufs M * N_per_rank and concat along N dim""" + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N_per_rank, BLOCK_SIZE_N) + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + # the cur_rank id inside the node (0 ~ local_world_size - 1) + cur_node = cur_rank // local_world_size + cur_rank_local = cur_rank % local_world_size + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n_local = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + # first do local copy + offs_n_cur_rank = offs_n_local + cur_rank_local * N_per_rank + offs_in = offs_m[:, None] * stride_scatter_m + offs_n_local * stride_scatter_n + ptrs = scatter_buf_ptr + offs_in + mask = (offs_m[:, None] < M) & (offs_n_local[None, :] < N_per_rank) + accum = tl.load(ptrs, mask=mask) + + # local reduce if nnodes > 1 + if DO_REDUCE: + for i in range(1, nnodes): + ptrs = scatter_buf_ptr + offs_in + i * max_seq_len * N_per_rank + accum += tl.load(ptrs, mask=mask) + + offs_out = offs_m[:, None] * stride_out_m + offs_n_cur_rank[None, :] * stride_out_n + out_ptrs = out_ptr + offs_out + + n_end_rank = (cur_rank_local + 1) * N_per_rank + n_end = tl.minimum(N, n_end_rank) + + mask = (offs_m[:, None] < M) & 
(offs_n_local[None, :] < n_end) + tl.store(out_ptrs, accum, mask) + + for r in range(1, local_world_size): + peer_rank_local = (cur_rank_local + r) % local_world_size + peer_rank = peer_rank_local + local_world_size * cur_node + offs_n_global = offs_n_local + peer_rank_local * N_per_rank + mask_in = (offs_m[:, None] < M) & (offs_n_local[None, :] < N_per_rank) + # load remote ptr + remote_ptr = tl.load(scatter_bufs_ptr + peer_rank).to(tl.pointer_type(tl.float32)) + offs_in = offs_m[:, None] * stride_scatter_m + offs_n_local[None, :] * stride_scatter_n + ptrs = remote_ptr + offs_in + a = tl.load(ptrs, mask=mask_in, other=0.0) + offs_out = offs_m[:, None] * stride_out_m + offs_n_global[None, :] * stride_out_n + n_end_rank = (peer_rank_local + 1) * N_per_rank + n_end = tl.minimum(N, n_end_rank) + mask_out = (offs_m[:, None] < M) & (offs_n_global[None, :] < n_end_rank) + out_ptrs = out_ptr + offs_out + tl.store(out_ptrs, a, mask_out) + + +@triton.jit +def putmem_kernel( + shm_ctx, + out_ptr, # inout + max_seq_len: tl.constexpr, + M: tl.constexpr, + N_per_rank: tl.constexpr, + cur_rank: tl.constexpr, + local_world_size: tl.constexpr, + nnodes: tl.constexpr, + TOKEN_BYTES: tl.constexpr, +): + # set rocshmem device ctx + libshmem_device.set_rocshmem_ctx(shm_ctx) + + pid = tl.program_id(0) + byte_cnt = M * N_per_rank * TOKEN_BYTES + + for i in range(1, nnodes): + peer_node = (i + 1) % nnodes + peer_rank = peer_node * local_world_size + cur_rank % local_world_size + dst_ptr = out_ptr + i * max_seq_len * N_per_rank + libshmem_device.putmem_nbi_wg( + dst_ptr, + out_ptr, + byte_cnt, + peer_rank, + ) + + # libshmem_device.fence() + return diff --git a/aiter/ops/triton/gemm_w8a8.py b/aiter/ops/triton/gemm_w8a8.py new file mode 100644 index 0000000000000000000000000000000000000000..15a57a4ed2e79b59900a9c508f3c70a1f3cd6e17 --- /dev/null +++ b/aiter/ops/triton/gemm_w8a8.py @@ -0,0 +1,408 @@ +# SPDX-License-Identifier: MIT + +import os +import json +import logging +import functools 
+from functools import partial +from typing import Any, Dict, List, Optional, Tuple +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH +import aiter.ops.triton.utils.arch_info as arch_info +from aiter import logger +from aiter.jit.utils.chip_info import get_cu_num +from aiter.ops.triton.utils.common_utils import save_kernel_path, has_kernel_cache, get_triton_cache_dir +from aiter.jit.utils.chip_info import get_gfx, get_cu_num + +import torch +import triton +import triton.language as tl + +@functools.lru_cache +def get_w8a8_block_int8_configs(N: int, K: int, block_n: int, + block_k: int) -> Optional[dict[int, Any]]: + """ + Return optimized configurations for the w8a8 block fp8 kernel. + + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. 
+ """ + + # First look up if an optimized configuration is available in the configs + # directory + # device_name = current_platform.get_device_name().replace(" ", "_") + device_name = arch_info.get_device() + device_name = "BW200" if device_name.lower().startswith("bw") else device_name + + # new config by arch and cu number + arch = triton.runtime.driver.active.get_current_target().arch + num_cu = get_cu_num() + json_file_name = f"N={N},K={K},arch={arch},cu={num_cu},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + config_file_path = os.path.join( + f"{AITER_TRITON_CONFIGS_PATH}", "gemm/block_w8a8", json_file_name + ) + + # Fallback to device config (to be removed) + if not os.path.exists(config_file_path): + json_file_name = f"N={N},K={K},device_name={device_name},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + + config_file_path = os.path.join( + f"{AITER_TRITON_CONFIGS_PATH}", "gemm/block_w8a8", json_file_name + ) + + if os.path.exists(config_file_path): + with open(config_file_path) as f: + #logger.info( + # "Using configuration from %s for W8A8 Block INT8 kernel.", + # config_file_path, + #) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default + # configuration + logger.warning( + ("Using default W8A8 Block INT8 kernel config. Performance might " + "be sub-optimal! 
Config file not found at %s"), + config_file_path, + ) + return None + +''' +configs = [ + triton.Config( + {"BLOCK_SIZE_M": BLOCK_SIZE_M, "BLOCK_SIZE_N": BLOCK_SIZE_N, "BLOCK_SIZE_K": BLOCK_SIZE_K, + "GROUP_SIZE_M": GROUP_SIZE_M, "COMBINE_SCALE_LOAD": COMBINE_SCALE_LOAD, + "USE_MLS_LOAD": USE_MLS_LOAD}, + num_warps=num_warps, num_stages=num_stages) + # for BLOCK_SIZE_M in [16, 32, 64, 128, 256] + # for BLOCK_SIZE_N in [16, 32, 64, 128, 256] + # for BLOCK_SIZE_M in [16, 32] + for BLOCK_SIZE_M in [16, 32, 64] + for BLOCK_SIZE_N in [16, 32, 64, 128] + for BLOCK_SIZE_K in [128] + for GROUP_SIZE_M in [16, 32, 64] + # for GROUP_SIZE_M in [32] + for COMBINE_SCALE_LOAD in [True, False] + # for COMBINE_SCALE_LOAD in [False] + for USE_MLS_LOAD in [True, False] + # for USE_MLS_LOAD in [True] + for num_warps in [1, 2, 4, 8, 16] + for num_stages in [1, 2] +] + +# @triton.autotune( +@triton.utils.hcutune( + configs=configs, + key=["M", "K", "N", "group_n", "group_k"], + # perf_debug=True +) +''' + +@triton.heuristics( + values={ + "DIVISIBLE_M": lambda args: args["M"] % args["BLOCK_SIZE_M"] == 0, + "DIVISIBLE_N": lambda args: args["N"] % args["BLOCK_SIZE_N"] == 0, + 'DIVISIBLE_K': lambda args: args['K'] % args['BLOCK_SIZE_K'] == 0, + } +) +@triton.jit +def _w8a8_block_int8_matmul( + # Pointers to inputs and output + A, + B, + C, + As, + Bs, + # Shape for matmul + M, + N, + K, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + # Stride for inputs and output + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + stride_As_m: tl.constexpr, + stride_As_k: tl.constexpr, + stride_Bs_k: tl.constexpr, + stride_Bs_n: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + DIVISIBLE_M: tl.constexpr, + DIVISIBLE_N: tl.constexpr, + 
DIVISIBLE_K: tl.constexpr, + COMBINE_SCALE_LOAD: tl.constexpr = False, + USE_MLS_LOAD: tl.constexpr = False +): + """Triton-accelerated function used to perform linear operations (dot + product) on input tensors `A` and `B` with block-wise quantization, and + store the result in output tensor `C`. + """ + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + tl.assume(pid > 0) + tl.assume(pid_m > 0) + tl.assume(pid_n > 0) + tl.assume(group_n > 0) + tl.assume(group_k > 0) + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_As_m > 0) + tl.assume(stride_As_k > 0) + tl.assume(stride_Bs_k > 0) + tl.assume(stride_Bs_n > 0) + + if group_k > 0: + tl.static_assert(BLOCK_SIZE_K <= group_k and group_k % BLOCK_SIZE_K == 0, + "BLOCK_SIZE_K must be divisible by GROUP_SIZE_K") + if COMBINE_SCALE_LOAD: # used for use_int8_w8a8 + tl.static_assert(stride_As_k == 1, + "COMBINE_SCALE_LOAD implictly stride_As_k == 1!") + tl.static_assert(DIVISIBLE_K == True and BLOCK_SIZE_K == group_k, + "COMBINE_SCALE_LOAD only add and verify on block_k_diviable!") + if USE_MLS_LOAD: + tl.static_assert(DIVISIBLE_K == True and DIVISIBLE_N == True, + "USE_MLS_LOAD must require block_k_diviable and block_n_diviable!") + + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + 
mls_offs_k = 0 + if COMBINE_SCALE_LOAD: + As_ptrs = As + offs_am[:, None] * stride_As_m + offs_bsn = offs_bn // group_n + Bs_ptrs = Bs + offs_bsn[:, None] * stride_Bs_n + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2): + a0 = tl.load(a_ptrs, + mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, + other=0.0) + if not USE_MLS_LOAD: + b0 = tl.load(b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0) + else: + b0 = tl.matrix_load( + B, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N]) + + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + tl.arange(0, 2) + a_s = tl.load(As_ptrs + offs_ks[None, :] * stride_As_k, + mask=offs_ks[None, :] <= (K - 1) // group_k, + other=0.0) + b_s = tl.load(Bs_ptrs + offs_ks[None, :] * stride_Bs_k, + mask=offs_ks[None, :] <= (K - 1) // group_k, + other=0.0) + a_s0, a_s1 = tl.split(a_s) + b_s0, b_s1 = tl.split(b_s) + + accumulator += tl.dot(a0, b0).to(tl.float32) * a_s0[:, None] * b_s0[None, :] + + a0 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, + mask=offs_k[None, :] < K - (k + 1)* BLOCK_SIZE_K, + other=0.0) + if not USE_MLS_LOAD: + b0 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk, + mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, + other=0.0) + else: + b0 = tl.matrix_load( + B, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k + BLOCK_SIZE_K, (pid_n * BLOCK_SIZE_N) % N]) + + accumulator += tl.dot(a0, b0).to(tl.float32) * a_s1[:, None] * b_s1[None, :] + + a_ptrs += BLOCK_SIZE_K * stride_ak * 2 + b_ptrs += BLOCK_SIZE_K * stride_bk * 2 + mls_offs_k += BLOCK_SIZE_K * 2 + else: + As_ptrs = As + offs_am * stride_As_m + offs_bsn = offs_bn // group_n + Bs_ptrs = Bs + offs_bsn * stride_Bs_n + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, 
def gemm_w8a8(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Multiply block-wise quantized `A` by `B^T` and return the result.

    The leading dimensions of `A` are flattened into a single row dimension
    `M`, the `_w8a8_block_int8_matmul` Triton kernel is launched on the
    resulting 2-D problem, and the output is returned with the leading
    dimensions of `A` restored.

    Args:
        A: The input tensor, e.g., activation. Shape `(..., K)`; must be
            contiguous.
        B: The input tensor, e.g., weight. Shape `(N, K)`; must be
            contiguous.
        As: The per-token-group quantization scale for `A`. Shape
            `A.shape[:-1] + (ceil(K / block_k),)`.
        Bs: The per-block quantization scale for `B`. Shape
            `(ceil(N / block_n), ceil(K / block_k))`.
        block_size: The block size for per-block quantization as
            `[block_n, block_k]`, e.g., `[128, 128]`.
        output_dtype: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of the matmul, shape `A.shape[:-1] + (N,)`.

    Raises:
        AssertionError: If the shapes of the operands, scales and
            `block_size` are inconsistent, or `A` / `B` are not contiguous.
    """
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]

    assert A.shape[-1] == B.shape[-1]
    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
    M = A.numel() // A.shape[-1]

    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    N, K = B.shape
    assert triton.cdiv(N, block_n) == Bs.shape[0]
    assert triton.cdiv(K, block_k) == Bs.shape[1]

    C_shape = A.shape[:-1] + (N, )
    C = A.new_empty(C_shape, dtype=output_dtype)
    if M == 0 or N == 0:
        # Nothing to compute; skip config lookup and the kernel launch.
        return C

    # The kernel addresses rows with a single flattened index in [0, M), so
    # take the row strides from explicit 2-D views. Reading `stride(-2)` on
    # the original tensors fails for 1-D inputs and, for a non-contiguous
    # `As` with more than two dims, is not the stride of the flattened row.
    # `reshape` returns a view whenever possible, so 2-D callers see exactly
    # the same strides as before.
    A_2d = A.view(M, K)
    C_2d = C.view(M, N)
    As_2d = As.reshape(M, As.shape[-1])

    configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1])
    if configs:
        # If an optimal configuration map has been found, pick the entry
        # whose key (a tuned M) is closest to the actual M.
        config = configs[min(configs, key=lambda tuned_m: abs(tuned_m - M))]
    else:
        # Default config.
        # Block-wise quant: block_size[1] (the kernel's group_k) must be a
        # multiple of BLOCK_SIZE_K, so use block_size[1] itself.
        config = {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": block_size[0],
            "BLOCK_SIZE_K": block_size[1],
            "GROUP_SIZE_M": 32,
            "COMBINE_SCALE_LOAD": False,
            "USE_MLS_LOAD": False,
            "num_warps": 4,
            "num_stages": 3,
        }

    def grid(META):
        # One program per (BLOCK_SIZE_M x BLOCK_SIZE_N) output tile.
        return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
                triton.cdiv(N, META["BLOCK_SIZE_N"]), )

    _w8a8_block_int8_matmul[grid](
        A_2d,
        B,
        C_2d,
        As_2d,
        Bs,
        M,
        N,
        K,
        block_n,
        block_k,
        A_2d.stride(0),
        A_2d.stride(1),
        # B and Bs are stored (N, K)-major; the kernel takes the K stride
        # first and the N stride second.
        B.stride(1),
        B.stride(0),
        C_2d.stride(0),
        C_2d.stride(1),
        As_2d.stride(0),
        As_2d.stride(1),
        Bs.stride(1),
        Bs.stride(0),
        **config,
    )

    return C
@gluon.jit
def _gluon_deepgemm_fp8_paged_mqa_logits(
    batch_size,
    next_n,
    heads_num,
    Q_buffer,
    stride_q_batch,
    stride_q_next_n,
    stride_q_heads,
    KV_buffer,
    stride_k_seq,
    scale_buffer,
    stride_scale_seq,
    context_len_ptr,
    kv_indices,
    weights,
    stride_w_batch,
    OutLogits_buffer,
    stride_out_batch,
    max_model_len,
    max_block_len,
    SplitKV,
    ChunkQ: tl.constexpr,
    ChunkK: tl.constexpr,
    HiddenDim: tl.constexpr,
    KVBlockSize: tl.constexpr = 1,
):
    # Gluon kernel computing paged MQA logits for one
    # (q-head block, next_n, batch, kv split) work tile.
    #
    # For every key position p handled by this program it stores
    #     sum_h weights[h] * max((q[h] . k[p]) * k_scale[p], 0)
    # over the ChunkQ heads of this program's head block, and stores -inf
    # for positions with p > context_length - next_n + pid_next_n.
    #
    # Arguments (as used by the code below):
    #   batch_size, next_n, heads_num: extents used to decompose the program
    #       id; heads_num is split into blocks of ChunkQ heads.
    #   Q_buffer + stride_q_batch / stride_q_next_n / stride_q_heads: query
    #       values; the HiddenDim axis is addressed with unit stride.
    #   KV_buffer, stride_k_seq: key rows, addressed as
    #       kv_index * stride_k_seq + [0, HiddenDim).
    #   scale_buffer, stride_scale_seq: one scale per key row, addressed as
    #       kv_index * stride_scale_seq.
    #   context_len_ptr: per-batch context length, read at [pid_batch].
    #   kv_indices: maps a context position to a key row, read at
    #       [pid_batch * max_block_len + position].
    #   weights, stride_w_batch: per-head weight, read at
    #       [(pid_batch * next_n + pid_next_n) * stride_w_batch + head].
    #   OutLogits_buffer, stride_out_batch: output, written at
    #       [(pid_batch * next_n + pid_next_n) * stride_out_batch + position].
    #   SplitKV: number of programs the context chunks of one batch entry
    #       are divided among.
    #   ChunkQ / ChunkK / HiddenDim: compile-time tile sizes (heads, key
    #       positions, hidden dimension).
    #   max_model_len and KVBlockSize are not read by this kernel.
    #
    # NOTE(review): the output offsets do not depend on pid_q_head, so when
    # heads_num > ChunkQ several programs store to the same locations --
    # presumably callers launch with heads_num <= ChunkQ; confirm.
    pid = tl.program_id(0)
    num_block_q_head = tl.cdiv(heads_num, ChunkQ)

    # Decompose the linear program id; the q-head block varies fastest, then
    # next_n, then batch, and the kv split varies slowest.
    pid_q_head, remain_pid = pid % num_block_q_head, pid // num_block_q_head
    pid_next_n, remain_pid = remain_pid % next_n, remain_pid // next_n
    pid_batch, pid_split_kv = remain_pid % batch_size, remain_pid // batch_size

    context_length = gl.load(context_len_ptr + pid_batch)

    # The context is cut into ChunkK-sized chunks and the chunks are divided
    # evenly (rounded up) among the SplitKV programs.
    context_chunk_num = tl.cdiv(context_length, ChunkK)
    split_context_chunk_num = tl.cdiv(context_chunk_num, SplitKV)

    split_context_start = (pid_split_kv * split_context_chunk_num) * ChunkK
    split_context_length = min(
        context_length - split_context_start, split_context_chunk_num * ChunkK
    )

    # This split lies entirely past the end of the context: nothing to do.
    if split_context_length <= 0:
        return

    # Number of positions by which the first chunk is shifted backwards so
    # that the last chunk ends exactly at
    # split_context_start + split_context_length.
    residual_context = (ChunkK - split_context_length % ChunkK) % ChunkK

    NumWarps: gl.constexpr = 4
    ThreadsPerWarp: gl.constexpr = 64

    # ===---------------------------------------------------
    # Gluon Layout
    # ===---------------------------------------------------
    # Each thread holds 16 contiguous elements along the hidden dimension, so
    # HiddenDim // 16 threads cover one row; the remaining threads of the
    # warps are spread over rows.
    ValQMPerThread: gl.constexpr = ChunkQ // (
        NumWarps * ThreadsPerWarp // (HiddenDim // 16)
    )
    layout_q: gl.constexpr = gl.BlockedLayout(
        size_per_thread=[ValQMPerThread, 16],  # q type is fp8 (E4M3)
        threads_per_warp=[ThreadsPerWarp // (HiddenDim // 16), HiddenDim // 16],
        warps_per_cta=[NumWarps, 1],
        order=[1, 0],
    )

    ValKNPerThread: gl.constexpr = ChunkK // (
        NumWarps * ThreadsPerWarp // (HiddenDim // 16)
    )
    layout_kv: gl.constexpr = gl.BlockedLayout(
        size_per_thread=[ValKNPerThread, 16],  # k type is fp8 (E4M3)
        threads_per_warp=[ThreadsPerWarp // (HiddenDim // 16), HiddenDim // 16],
        warps_per_cta=[NumWarps, 1],
        order=[1, 0],
    )

    # NOTE(review): this layout always uses a 2-D instr_shape, whereas the
    # preshuffle kernel in this file switches to a 3-D instr_shape based on
    # _Use_2d_instr_shape_mfma_layout -- confirm this kernel is still
    # expected to build on Triton versions that require the 3-D form.
    mfma_layout: gl.constexpr = gl.amd.AMDMFMALayout(
        version=4,
        instr_shape=[16, 16],
        transposed=False,
        warps_per_cta=[1, NumWarps],
    )
    mfma_layout_a: gl.constexpr = gl.DotOperandLayout(
        operand_index=0, parent=mfma_layout, k_width=16
    )
    mfma_layout_b: gl.constexpr = gl.DotOperandLayout(
        operand_index=1, parent=mfma_layout, k_width=16
    )

    # Layout of a per-head (row) vector that broadcasts against the MFMA
    # result tile.
    layout_scale: gl.constexpr = gl.SliceLayout(1, mfma_layout)

    # ===---------------------------------------------------
    # Pipeline Start
    # ===---------------------------------------------------
    # Load the (ChunkQ, HiddenDim) query tile of this head block.
    q = gl.amd.cdna3.buffer_load(
        ptr=Q_buffer,
        offsets=pid_batch * stride_q_batch
        + pid_next_n * stride_q_next_n
        + (
            (
                pid_q_head * ChunkQ
                + gl.arange(0, ChunkQ, layout=gl.SliceLayout(1, layout_q))
            )
            * stride_q_heads
        )[:, None]
        + gl.arange(0, HiddenDim, layout=gl.SliceLayout(0, layout_q))[None, :],
    )
    # Per-head weights for this (batch, next_n) row.
    scale_weight = gl.amd.cdna3.buffer_load(
        ptr=weights,
        offsets=(pid_batch * next_n + pid_next_n) * stride_w_batch
        + pid_q_head * ChunkQ
        + gl.arange(0, ChunkQ, layout=layout_scale),
    )

    # The first chunk starts residual_context positions before the split
    # start, which can be negative; mask those positions out. The same mask
    # is built twice because the key rows and the key scales use different
    # layouts.
    mask_kv_next = (
        split_context_start
        - residual_context
        + gl.arange(0, ChunkK, layout=gl.SliceLayout(1, layout_kv))
        >= 0
    )
    mask_kv_scale_next = (
        split_context_start
        - residual_context
        + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout))
        >= 0
    )
    # Key-row indices of the first chunk, once per layout.
    context_kv_idx_next = gl.amd.cdna3.buffer_load(
        ptr=kv_indices,
        offsets=pid_batch * max_block_len
        + split_context_start
        - residual_context
        + gl.arange(0, ChunkK, layout=gl.SliceLayout(1, layout_kv)),
        mask=mask_kv_next,
    )
    context_kv_scale_idx_next = gl.amd.cdna3.buffer_load(
        ptr=kv_indices,
        offsets=pid_batch * max_block_len
        + split_context_start
        - residual_context
        + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout)),
        mask=mask_kv_scale_next,
    )

    mfma_q = gl.convert_layout(q, mfma_layout_a)

    # Replace masked-out indices with row 0 so the (unmasked) key and scale
    # loads below read a defined location; the corresponding outputs are
    # never stored because of the store mask.
    context_kv_idx_next = tl.where(mask_kv_next, context_kv_idx_next, 0)
    k_next = gl.amd.cdna3.buffer_load(
        ptr=KV_buffer,
        offsets=context_kv_idx_next[:, None] * stride_k_seq
        + gl.arange(0, HiddenDim, layout=gl.SliceLayout(0, layout_kv))[None, :],
    )
    context_kv_scale_idx_next = tl.where(
        mask_kv_scale_next, context_kv_scale_idx_next, 0
    )
    k_scale_f_next = gl.amd.cdna3.buffer_load(
        ptr=scale_buffer, offsets=context_kv_scale_idx_next * stride_scale_seq
    )

    # Zero accumulator reused as the MFMA addend for every chunk.
    zero = gl.zeros((ChunkQ, ChunkK), dtype=tl.float32, layout=mfma_layout)
    # Main loop over every chunk except the last one. Each iteration consumes
    # the keys/scales fetched previously and issues the loads for the next
    # chunk, interleaved with the math of the current one.
    for context_idx in range(
        split_context_start - residual_context,
        split_context_start + split_context_length - ChunkK,
        ChunkK,
    ):
        k = k_next
        k_scale_f = k_scale_f_next

        # Indices of the next chunk. No mask is applied: the loop bound keeps
        # [context_idx + ChunkK, context_idx + 2 * ChunkK) at or below
        # split_context_start + split_context_length, and its start is
        # non-negative because residual_context < ChunkK.
        context_kv_idx_next = gl.amd.cdna3.buffer_load(
            ptr=kv_indices,
            offsets=pid_batch * max_block_len
            + context_idx
            + ChunkK
            + gl.arange(0, ChunkK, layout=gl.SliceLayout(1, layout_kv)),
        )
        context_kv_scale_idx_next = gl.amd.cdna3.buffer_load(
            ptr=kv_indices,
            offsets=pid_batch * max_block_len
            + context_idx
            + ChunkK
            + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout)),
        )

        #!=----------------------------
        _amd_iglp_sched_barrier(0x0)
        #!=----------------------------
        # q @ k^T for the current chunk, then apply the per-key scale.
        mfma_k = gl.convert_layout(k.T, mfma_layout_b)

        o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero)
        o = o * k_scale_f[None, :]

        #!=----------------------------
        _amd_iglp_sched_barrier(0x0)
        #!=----------------------------
        # Prefetch the key rows of the next chunk.
        k_next = gl.amd.cdna3.buffer_load(
            ptr=KV_buffer,
            offsets=context_kv_idx_next[:, None] * stride_k_seq
            + gl.arange(0, HiddenDim, layout=gl.SliceLayout(0, layout_kv))[None, :],
        )
        # ReLU, then weight each head.
        o = gl.maximum(o, 0.0)
        o = o * scale_weight[:, None]

        #!=----------------------------
        _amd_iglp_sched_barrier(0x0)
        #!=----------------------------
        # Prefetch the key scales of the next chunk.
        k_scale_f_next = gl.amd.cdna3.buffer_load(
            ptr=scale_buffer, offsets=context_kv_scale_idx_next * stride_scale_seq
        )

        # Positions beyond context_length - next_n + pid_next_n are set to
        # -inf before the reduction, so their summed logit is -inf as well.
        mask = (
            context_idx + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout))
            <= context_length - next_n + pid_next_n
        )
        o = tl.where(mask[None, :], o, float("-inf"))

        # Sum over the head axis and store one logit per key position;
        # positions with a negative index (first chunk only) are skipped.
        logits = gl.reduce(o, axis=0, combine_fn=_sum_combine)
        gl.amd.cdna3.buffer_store(
            logits,
            ptr=OutLogits_buffer,
            offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch
            + (
                context_idx
                + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout))
            ),
            mask=context_idx
            + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout))
            >= 0,
        )

    # Epilogue: process the last chunk, whose keys/scales were fetched either
    # before the loop or by its final iteration. No further prefetch is
    # issued here.
    context_idx = split_context_start + split_context_length - ChunkK
    k = k_next
    k_scale_f = k_scale_f_next

    mfma_k = gl.convert_layout(k.T, mfma_layout_b)
    o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero)

    o = o * k_scale_f[None, :]
    o = gl.maximum(o, 0.0)
    o = o * scale_weight[:, None]

    mask = (
        context_idx + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout))
        <= context_length - next_n + pid_next_n
    )
    o = tl.where(mask[None, :], o, float("-inf"))

    logits = gl.reduce(o, axis=0, combine_fn=_sum_combine)
    gl.amd.cdna3.buffer_store(
        logits,
        ptr=OutLogits_buffer,
        offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch
        + (context_idx + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout))),
        mask=context_idx + gl.arange(0, ChunkK, layout=gl.SliceLayout(0, mfma_layout))
        >= 0,
    )
weights, + stride_w_batch, + OutLogits_buffer, + stride_out_batch, + max_model_len, + max_block_len, + SplitKV, + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + KVBlockSize: tl.constexpr = 16, +): + # ===--------------------------------------------------- + # Gluon Layout + # ===--------------------------------------------------- + NumWarps: gl.constexpr = 4 + ThreadsPerWarp: gl.constexpr = 64 + + ValQMPerThread: gl.constexpr = ChunkQ // ( + NumWarps * ThreadsPerWarp // (HiddenDim // 16) + ) + layout_q: gl.constexpr = gl.BlockedLayout( + size_per_thread=[ValQMPerThread, 16], # q type is fp8 (E4M3) + threads_per_warp=[ThreadsPerWarp // (HiddenDim // 16), HiddenDim // 16], + warps_per_cta=[NumWarps, 1], + order=[1, 0], + ) + + ChunkKPerStage: gl.constexpr = ChunkK // 2 + MFMAPerWarp: gl.constexpr = ChunkKPerStage // 16 // NumWarps + + if _Use_2d_instr_shape_mfma_layout: + mfma_layout: gl.constexpr = gl.amd.AMDMFMALayout( + version=4, + instr_shape=[16, 16], + transposed=False, + warps_per_cta=[1, NumWarps], + tiles_per_warp=[1, MFMAPerWarp], + ) + else: + mfma_layout: gl.constexpr = gl.amd.AMDMFMALayout( + version=4, + instr_shape=[16, 16, 32], + transposed=False, + warps_per_cta=[1, NumWarps], + tiles_per_warp=[1, MFMAPerWarp], + ) + + mfma_layout_a: gl.constexpr = gl.DotOperandLayout( + operand_index=0, parent=mfma_layout, k_width=16 + ) + mfma_layout_b: gl.constexpr = gl.DotOperandLayout( + operand_index=1, parent=mfma_layout, k_width=16 + ) + + layout_scale: gl.constexpr = gl.SliceLayout(1, mfma_layout) + + ContextBlockPerChunkK: gl.constexpr = ChunkK // KVBlockSize + ChunkKStagePerContextBlock: gl.constexpr = KVBlockSize // ChunkKPerStage + + LoadBlockIndiceForEachStage: gl.constexpr = ChunkKPerStage % KVBlockSize == 0 + + # DS_WRITE: gl.constexpr = 0x200 + DS_READ: gl.constexpr = 0x100 + BUFFER_LOAD: gl.constexpr = 0x020 + MFMA: gl.constexpr = 0x008 + # VALU: gl.constexpr = 0x002 + + # 
===--------------------------------------------------- + # Mapping WorkTile + # ===--------------------------------------------------- + pid = tl.program_id(0) + + # ===--------------------------------------------------- + pid_batch, remain_pid = pid % batch_size, pid // batch_size + pid_next_n, pid_split_kv = remain_pid % next_n, remain_pid // next_n + # ===--------------------------------------------------- + context_length = gl.load(context_len_ptr + pid_batch) + + context_chunk_num = tl.cdiv(context_length, ChunkK) + split_context_chunk_num = context_chunk_num // SplitKV + residual_context_chunks = context_chunk_num % SplitKV + split_context_start = ( + pid_split_kv * split_context_chunk_num * ChunkK + + min(pid_split_kv, residual_context_chunks) * ChunkK + ) + split_context_length = min( + context_length - split_context_start, + split_context_chunk_num * ChunkK + + (ChunkK if pid_split_kv < residual_context_chunks else 0), + ) + + if split_context_length <= 0: + return + + if LoadBlockIndiceForEachStage: + split_context_block = tl.cdiv(split_context_length, KVBlockSize) + split_context_length = split_context_block * KVBlockSize + + residual_context_blocks = ( + ContextBlockPerChunkK - split_context_block % ContextBlockPerChunkK + ) % ContextBlockPerChunkK + residual_context = residual_context_blocks * KVBlockSize + + # ===--------------------------------------------------- + # Pipeline Start + _amd_iglp_sched_barrier(0x0) + # ===--------------------------------------------------- + q = gl.amd.cdna3.buffer_load( + ptr=Q_buffer, + offsets=pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ( + gl.arange(0, ChunkQ, layout=gl.SliceLayout(1, layout_q)) + * stride_q_heads + )[:, None] + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(0, layout_q))[None, :], + ) + + context_idx = split_context_start - residual_context + + mask_kv_next_0 = ( + context_idx // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // 
KVBlockSize + ) >= split_context_start // KVBlockSize + context_kv_idx_next_0 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + context_idx // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + mask=mask_kv_next_0, + ) + + mask_kv_next_1 = ( + (context_idx + ChunkKPerStage) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize + ) >= split_context_start // KVBlockSize + context_kv_idx_next_1 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkKPerStage) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + mask=mask_kv_next_1, + ) + + scale_weight = gl.amd.cdna3.buffer_load( + ptr=weights, + offsets=(pid_batch * next_n + pid_next_n) * stride_w_batch + + gl.arange(0, ChunkQ, layout=layout_scale), + ) + + offset_k_fixed = ( + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) % 16 + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) + // 16 + * 256 + )[:, None] + ( + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % 16 + * 16 + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize + // 16 + * 16 + * 128 + )[ + None, : + ] + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + mfma_q = gl.convert_layout(q, mfma_layout_a) + + context_kv_idx_next_0 = tl.where(mask_kv_next_0, context_kv_idx_next_0, 0) + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_0[None, :] * stride_k_seq, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + + _amd_iglp_sched_group_barrier(DS_READ, 4, 0) 
+ _amd_iglp_sched_group_barrier(BUFFER_LOAD, 4, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + + if context_idx + ChunkK < split_context_start + split_context_length: + context_kv_idx_next_0 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkK) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + ) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + # ===--------------------------------------------------- + # Precompute First Iteration + # ===--------------------------------------------------- + zero = gl.zeros((ChunkQ, ChunkKPerStage), dtype=tl.float32, layout=mfma_layout) + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + context_kv_idx_next_1 = tl.where(mask_kv_next_1, context_kv_idx_next_1, 0) + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_1[None, :] * stride_k_seq, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_1 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + 
#!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + mask=context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + >= split_context_start, + ) + + for context_idx in range( + split_context_start - residual_context, + split_context_start + split_context_length - ChunkK, + ChunkK, + ): + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + context_kv_idx_next_1 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkK + ChunkKPerStage) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + ) + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_0[None, :] * stride_k_seq, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + 
_amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + mask=context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + >= split_context_start, + ) + + # ======================================================================================= + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + # #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + # #!=---------------------------- + if ( + context_idx + ChunkK + ChunkK + < split_context_start + split_context_length + ): + context_kv_idx_next_0 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkK + ChunkK) // KVBlockSize + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b) + ) + // KVBlockSize, + ) + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_1[None, :] * stride_k_seq, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_1 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + _amd_s_set_prio(2) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + 
_amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + ) + + context_idx = split_context_start + split_context_length - ChunkK + + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + _amd_s_set_prio(1) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + mask = ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= context_length - next_n + pid_next_n + ) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + mask=context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + >= split_context_start, + ) + else: + context_idx = split_context_start + current_chunk_rank = context_idx // ChunkKPerStage % 
ChunkKStagePerContextBlock + block_idx = context_idx // KVBlockSize + batch_blocks = tl.cdiv(context_length, KVBlockSize) + + q = gl.amd.cdna3.buffer_load( + ptr=Q_buffer, + offsets=pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ( + gl.arange(0, ChunkQ, layout=gl.SliceLayout(1, layout_q)) + * stride_q_heads + )[:, None] + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(0, layout_q))[None, :], + ) + + context_kv_idx_next_0 = gl.load( + kv_indices + pid_batch * max_block_len + block_idx + ) + block_idx += 1 + + context_kv_idx_next_1 = gl.load( + kv_indices + pid_batch * max_block_len + block_idx, + mask=block_idx < batch_blocks, + ) + block_idx += 1 + + scale_weight = gl.amd.cdna3.buffer_load( + ptr=weights, + offsets=(pid_batch * next_n + pid_next_n) * stride_w_batch + + gl.arange(0, ChunkQ, layout=layout_scale), + ) + + offset_k_fixed = ( + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) % 16 + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) + // 16 + * 256 + )[:, None] + ( + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % 16 + * 16 + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize + // 16 + * 16 + * 128 + )[ + None, : + ] + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + mfma_q = gl.convert_layout(q, mfma_layout_a) + + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + context_idx % KVBlockSize * HiddenDim, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + ) + % KVBlockSize, + ) + + _amd_iglp_sched_group_barrier(DS_READ, 4, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 4, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + 
_amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + # ===--------------------------------------------------- + # Precompute First Iteration + # ===--------------------------------------------------- + zero = gl.zeros((ChunkQ, ChunkKPerStage), dtype=tl.float32, layout=mfma_layout) + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + (context_idx + ChunkKPerStage) % KVBlockSize * HiddenDim, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + ) + % KVBlockSize, + ) + current_chunk_rank += 2 + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + mask = ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= context_length - next_n + pid_next_n + ) + + logits = 
gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + ) + + for context_idx_ in range( + split_context_start, + split_context_start + split_context_length - ChunkK, + ChunkK, + ): + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + if current_chunk_rank == ChunkKStagePerContextBlock: + current_chunk_rank = 0 + context_kv_idx_next_0 = context_kv_idx_next_1 + context_kv_idx_next_1 = gl.load( + kv_indices + pid_batch * max_block_len + block_idx, + mask=block_idx < batch_blocks, + ) + block_idx += 1 + + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + (context_idx_ + ChunkK) % KVBlockSize * HiddenDim, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx_ + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b) + ) + ) + % KVBlockSize, + ) + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + o = o * 
k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx_ + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + ) + + # ======================================================================================= + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + (context_idx_ + ChunkK + ChunkKPerStage) % KVBlockSize * HiddenDim, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx_ + + ChunkK + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b) + ) + ) + % KVBlockSize, + ) + current_chunk_rank += 2 + + _amd_s_set_prio(2) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + mask = ( + context_idx_ + + ChunkK + + gl.arange(0, 
ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= context_length - next_n + pid_next_n + ) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx_ + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + ) + context_idx = context_idx_ + ChunkK + + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + _amd_s_set_prio(1) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + mask = ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= context_length - next_n + pid_next_n + ) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + ) + + +@gluon.jit +def _gluon_deepgemm_fp8_paged_mqa_logits_preshuffle_varctx( + batch_size, + next_n, + heads_num, + Q_buffer, + stride_q_batch, + stride_q_next_n, + stride_q_heads, + KV_buffer, + stride_k_seq, + scale_buffer, + stride_scale_seq, + context_len_ptr, + kv_indices, + weights, + stride_w_batch, + OutLogits_buffer, + stride_out_batch, + max_model_len, + max_block_len, + safe_chunks_per_cta_ptr, + ChunkQ: tl.constexpr, + ChunkK: tl.constexpr, + HiddenDim: tl.constexpr, + KVBlockSize: tl.constexpr = 16, +): + # ===--------------------------------------------------- + # Gluon Layout + # 
===--------------------------------------------------- + NumWarps: gl.constexpr = 4 + ThreadsPerWarp: gl.constexpr = 64 + + ValQMPerThread: gl.constexpr = ChunkQ // ( + NumWarps * ThreadsPerWarp // (HiddenDim // 16) + ) + layout_q: gl.constexpr = gl.BlockedLayout( + size_per_thread=[ValQMPerThread, 16], # q type is fp8 (E4M3) + threads_per_warp=[ThreadsPerWarp // (HiddenDim // 16), HiddenDim // 16], + warps_per_cta=[NumWarps, 1], + order=[1, 0], + ) + + ChunkKPerStage: gl.constexpr = ChunkK // 2 + MFMAPerWarp: gl.constexpr = ChunkKPerStage // 16 // NumWarps + + if _Use_2d_instr_shape_mfma_layout: + mfma_layout: gl.constexpr = gl.amd.AMDMFMALayout( + version=4, + instr_shape=[16, 16], + transposed=False, + warps_per_cta=[1, NumWarps], + tiles_per_warp=[1, MFMAPerWarp], + ) + else: + mfma_layout: gl.constexpr = gl.amd.AMDMFMALayout( + version=4, + instr_shape=[16, 16, 32], + transposed=False, + warps_per_cta=[1, NumWarps], + tiles_per_warp=[1, MFMAPerWarp], + ) + + mfma_layout_a: gl.constexpr = gl.DotOperandLayout( + operand_index=0, parent=mfma_layout, k_width=16 + ) + mfma_layout_b: gl.constexpr = gl.DotOperandLayout( + operand_index=1, parent=mfma_layout, k_width=16 + ) + + layout_scale: gl.constexpr = gl.SliceLayout(1, mfma_layout) + + ContextBlockPerChunkK: gl.constexpr = ChunkK // KVBlockSize + ChunkKStagePerContextBlock: gl.constexpr = KVBlockSize // ChunkKPerStage + + LoadBlockIndiceForEachStage: gl.constexpr = ChunkKPerStage % KVBlockSize == 0 + + # DS_WRITE: gl.constexpr = 0x200 + DS_READ: gl.constexpr = 0x100 + BUFFER_LOAD: gl.constexpr = 0x020 + MFMA: gl.constexpr = 0x008 + # VALU: gl.constexpr = 0x002 + + # ===--------------------------------------------------- + # Mapping WorkTile + # ===--------------------------------------------------- + pid = tl.program_id(0) + + pid_split_kv = pid + safe_chunks_per_cta = gl.load(safe_chunks_per_cta_ptr) + + pid_batch = 0 + context_length = gl.load(context_len_ptr + pid_batch) + + cur_batch_chunk_num = 
tl.cdiv(context_length, ChunkK) + cur_batch_cta_count = tl.cdiv(cur_batch_chunk_num, safe_chunks_per_cta) + + while pid_split_kv >= cur_batch_cta_count * next_n and cur_batch_cta_count > 0: + pid_split_kv -= cur_batch_cta_count * next_n + pid_batch += 1 + context_length = gl.load( + context_len_ptr + pid_batch, mask=pid_batch < batch_size, other=0 + ) + cur_batch_chunk_num = tl.cdiv(context_length, ChunkK) + cur_batch_cta_count = tl.cdiv(cur_batch_chunk_num, safe_chunks_per_cta) + + if context_length == 0: + return + + pid_next_n = pid_split_kv % next_n + pid_split_kv = pid_split_kv // next_n + + split_context_chunk_num = cur_batch_chunk_num // cur_batch_cta_count + residual_context_chunks = cur_batch_chunk_num % cur_batch_cta_count + split_context_start = ( + pid_split_kv * split_context_chunk_num * ChunkK + + min(pid_split_kv, residual_context_chunks) * ChunkK + ) + split_context_length = min( + context_length - split_context_start, + split_context_chunk_num * ChunkK + + (ChunkK if pid_split_kv < residual_context_chunks else 0), + ) + + if split_context_length <= 0: + return + + if LoadBlockIndiceForEachStage: + split_context_block = tl.cdiv(split_context_length, KVBlockSize) + split_context_length = split_context_block * KVBlockSize + + residual_context_blocks = ( + ContextBlockPerChunkK - split_context_block % ContextBlockPerChunkK + ) % ContextBlockPerChunkK + residual_context = residual_context_blocks * KVBlockSize + + # ===--------------------------------------------------- + # Pipeline Start + _amd_iglp_sched_barrier(0x0) + # ===--------------------------------------------------- + q = gl.amd.cdna3.buffer_load( + ptr=Q_buffer, + offsets=pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ( + gl.arange(0, ChunkQ, layout=gl.SliceLayout(1, layout_q)) + * stride_q_heads + )[:, None] + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(0, layout_q))[None, :], + ) + + context_idx = split_context_start - residual_context + + mask_kv_next_0 = ( + 
context_idx // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize + ) >= split_context_start // KVBlockSize + context_kv_idx_next_0 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + context_idx // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + mask=mask_kv_next_0, + ) + + mask_kv_next_1 = ( + (context_idx + ChunkKPerStage) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize + ) >= split_context_start // KVBlockSize + context_kv_idx_next_1 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkKPerStage) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + mask=mask_kv_next_1, + ) + + scale_weight = gl.amd.cdna3.buffer_load( + ptr=weights, + offsets=(pid_batch * next_n + pid_next_n) * stride_w_batch + + gl.arange(0, ChunkQ, layout=layout_scale), + ) + + offset_k_fixed = ( + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) % 16 + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) + // 16 + * 256 + )[:, None] + ( + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % 16 + * 16 + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize + // 16 + * 16 + * 128 + )[ + None, : + ] + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + mfma_q = gl.convert_layout(q, mfma_layout_a) + + context_kv_idx_next_0 = tl.where(mask_kv_next_0, context_kv_idx_next_0, 0) + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_0[None, :] * stride_k_seq, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, 
layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + + _amd_iglp_sched_group_barrier(DS_READ, 4, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 4, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + + if context_idx + ChunkK < split_context_start + split_context_length: + context_kv_idx_next_0 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkK) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + ) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + # ===--------------------------------------------------- + # Precompute First Iteration + # ===--------------------------------------------------- + zero = gl.zeros((ChunkQ, ChunkKPerStage), dtype=tl.float32, layout=mfma_layout) + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + context_kv_idx_next_1 = tl.where(mask_kv_next_1, context_kv_idx_next_1, 0) + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_1[None, :] * stride_k_seq, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_1 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + 
_amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + mask=context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + >= split_context_start, + ) + + for context_idx in range( + split_context_start - residual_context, + split_context_start + split_context_length - ChunkK, + ChunkK, + ): + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + context_kv_idx_next_1 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkK + ChunkKPerStage) // KVBlockSize + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + // KVBlockSize, + ) + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_0[None, :] * stride_k_seq, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + 
_amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + mask=context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + >= split_context_start, + ) + + # ======================================================================================= + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + # #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + # #!=---------------------------- + if ( + context_idx + ChunkK + ChunkK + < split_context_start + split_context_length + ): + context_kv_idx_next_0 = gl.amd.cdna3.buffer_load( + ptr=kv_indices, + offsets=pid_batch * max_block_len + + (context_idx + ChunkK + ChunkK) // KVBlockSize + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b) + ) + // KVBlockSize, + ) + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + context_kv_idx_next_1[None, :] * stride_k_seq, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_1 * stride_scale_seq + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize, + ) + _amd_s_set_prio(2) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + 
_amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + ) + + context_idx = split_context_start + split_context_length - ChunkK + + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + _amd_s_set_prio(1) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + mask = ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= context_length - next_n + pid_next_n + ) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + mask=context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + >= split_context_start, + ) + else: + 
context_idx = split_context_start + current_chunk_rank = context_idx // ChunkKPerStage % ChunkKStagePerContextBlock + block_idx = context_idx // KVBlockSize + batch_blocks = tl.cdiv(context_length, KVBlockSize) + + q = gl.amd.cdna3.buffer_load( + ptr=Q_buffer, + offsets=pid_batch * stride_q_batch + + pid_next_n * stride_q_next_n + + ( + gl.arange(0, ChunkQ, layout=gl.SliceLayout(1, layout_q)) + * stride_q_heads + )[:, None] + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(0, layout_q))[None, :], + ) + + context_kv_idx_next_0 = gl.load( + kv_indices + pid_batch * max_block_len + block_idx + ) + block_idx += 1 + + context_kv_idx_next_1 = gl.load( + kv_indices + pid_batch * max_block_len + block_idx, + mask=block_idx < batch_blocks, + ) + block_idx += 1 + + scale_weight = gl.amd.cdna3.buffer_load( + ptr=weights, + offsets=(pid_batch * next_n + pid_next_n) * stride_w_batch + + gl.arange(0, ChunkQ, layout=layout_scale), + ) + + offset_k_fixed = ( + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) % 16 + + gl.arange(0, HiddenDim, layout=gl.SliceLayout(1, mfma_layout_b)) + // 16 + * 256 + )[:, None] + ( + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % 16 + * 16 + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + % KVBlockSize + // 16 + * 16 + * 128 + )[ + None, : + ] + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + mfma_q = gl.convert_layout(q, mfma_layout_a) + + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + context_idx % KVBlockSize * HiddenDim, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + ) + % KVBlockSize, + ) + + _amd_iglp_sched_group_barrier(DS_READ, 4, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 
4, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(DS_READ, 2, 0) + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + # ===--------------------------------------------------- + # Precompute First Iteration + # ===--------------------------------------------------- + zero = gl.zeros((ChunkQ, ChunkKPerStage), dtype=tl.float32, layout=mfma_layout) + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + (context_idx + ChunkKPerStage) % KVBlockSize * HiddenDim, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b)) + ) + % KVBlockSize, + ) + current_chunk_rank += 2 + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + mask = ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= 
context_length - next_n + pid_next_n + ) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + ) + + for context_idx_ in range( + split_context_start, + split_context_start + split_context_length - ChunkK, + ChunkK, + ): + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + if current_chunk_rank == ChunkKStagePerContextBlock: + current_chunk_rank = 0 + context_kv_idx_next_0 = context_kv_idx_next_1 + context_kv_idx_next_1 = gl.load( + kv_indices + pid_batch * max_block_len + block_idx, + mask=block_idx < batch_blocks, + ) + block_idx += 1 + + k_next_0 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + (context_idx_ + ChunkK) % KVBlockSize * HiddenDim, + ) + k_scale_f_next_0 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx_ + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b) + ) + ) + % KVBlockSize, + ) + + _amd_s_set_prio(3) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_scale_f = gl.convert_layout(k_scale_f, 
gl.SliceLayout(0, mfma_layout)) + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(1) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx_ + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + ) + + # ======================================================================================= + + k = k_next_0 + k_scale_f = k_scale_f_next_0 + + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + k_next_1 = gl.amd.cdna3.buffer_load( + ptr=KV_buffer, + offsets=offset_k_fixed + + context_kv_idx_next_0 * stride_k_seq + + (context_idx_ + ChunkK + ChunkKPerStage) % KVBlockSize * HiddenDim, + ) + k_scale_f_next_1 = gl.amd.cdna3.buffer_load( + ptr=scale_buffer, + offsets=context_kv_idx_next_0 * stride_scale_seq + + ( + context_idx_ + + ChunkK + + ChunkKPerStage + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout_b) + ) + ) + % KVBlockSize, + ) + current_chunk_rank += 2 + + _amd_s_set_prio(2) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + _amd_iglp_sched_group_barrier(BUFFER_LOAD, 2, 0) + _amd_iglp_sched_group_barrier(MFMA, 8, 0) + #!=---------------------------- + _amd_iglp_sched_barrier(0x0) + #!=---------------------------- + + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + mask = ( + 
context_idx_ + + ChunkK + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= context_length - next_n + pid_next_n + ) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx_ + + ChunkK + + gl.arange( + 0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout) + ) + ), + ) + context_idx = context_idx_ + ChunkK + + k = k_next_1 + k_scale_f = k_scale_f_next_1 + + _amd_s_set_prio(1) + mfma_k = gl.convert_layout(k, mfma_layout_b) + o = gl.amd.cdna3.mfma(mfma_q, mfma_k, zero) + k_scale_f = gl.convert_layout(k_scale_f, gl.SliceLayout(0, mfma_layout)) + o = o * k_scale_f[None, :] + o = gl.maximum(o, 0.0) + o = o * scale_weight[:, None] + _amd_s_set_prio(0) + + mask = ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + <= context_length - next_n + pid_next_n + ) + + logits = gl.reduce(o, axis=0, combine_fn=_sum_combine) + logits = tl.where(mask, logits, float("-inf")) + gl.amd.cdna3.buffer_store( + logits, + ptr=OutLogits_buffer, + offsets=(pid_batch * next_n + pid_next_n) * stride_out_batch + + ( + context_idx + + ChunkKPerStage + + gl.arange(0, ChunkKPerStage, layout=gl.SliceLayout(0, mfma_layout)) + ), + ) diff --git a/aiter/ops/triton/group_quant_int8.py b/aiter/ops/triton/group_quant_int8.py new file mode 100644 index 0000000000000000000000000000000000000000..59fd2ef22f14c10726680457c0a827d6a4a42ebc --- /dev/null +++ b/aiter/ops/triton/group_quant_int8.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from https://github.com/sgl-project/sglang/blob/4cb53ecd0cffceb6dee5c011a58f65997a86f151/python/sglang/srt/layers/quantization/int8_kernel.py +import functools +import json +import os +from typing import 
@triton.jit
def _per_token_group_quant_int8(
    # Pointers to inputs and output
    y_ptr,
    y_q_ptr,
    y_s_ptr,
    M,
    # Lower bound for the per-group abs-max, avoids dividing by zero
    eps,
    int8_min,
    int8_max,
    GROUP_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """Quantize a flat buffer to int8 with one scale per group.

    Each program handles ``BLOCK_SIZE`` consecutive elements, i.e.
    ``S_NUM = BLOCK_SIZE // GROUP_SIZE`` whole groups, and writes the
    quantized values plus one scale (``absmax / int8_max``) per group.

    Args:
        y_ptr: Input buffer, read as ``M`` flat elements.
        y_q_ptr: Output buffer for the quantized values (``M`` elements).
        y_s_ptr: Output buffer for the scales, one per group.
        M: Total number of input elements.
        eps: Minimum value used for the per-group abs-max.
        int8_min: Lower clamp bound of the quantized values.
        int8_max: Upper clamp bound; also the divisor of the scale.
        GROUP_SIZE: Number of elements sharing one scale.
        BLOCK_SIZE: Number of elements processed per program.
    """
    g_id = tl.program_id(0)
    y_ptr += g_id * BLOCK_SIZE
    y_q_ptr += g_id * BLOCK_SIZE
    S_NUM: tl.constexpr = BLOCK_SIZE // GROUP_SIZE
    y_s_ptr += g_id * S_NUM

    cols = tl.arange(0, BLOCK_SIZE)
    s_cols = tl.arange(0, S_NUM)
    mask = g_id * BLOCK_SIZE + cols < M
    # A group is valid when its first element is in range. Without this mask
    # the last program writes scales past the end of the scale buffer
    # whenever M is not a multiple of BLOCK_SIZE.
    s_mask = g_id * BLOCK_SIZE + s_cols * GROUP_SIZE < M

    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
    y = tl.reshape(y, (S_NUM, GROUP_SIZE))
    # Quant
    _absmax = tl.maximum(tl.max(tl.abs(y), axis=1), eps)
    y_s = (_absmax / int8_max).reshape(S_NUM, 1)
    y_q = tl.clamp(y / y_s, int8_min, int8_max).to(y_q_ptr.dtype.element_ty)

    y_q = tl.reshape(y_q, (S_NUM * GROUP_SIZE))
    y_s = tl.reshape(y_s, (S_NUM))

    tl.store(y_q_ptr + cols, y_q, mask=mask)
    tl.store(y_s_ptr + s_cols, y_s.to(y_s_ptr.dtype.element_ty), mask=s_mask)


@functools.lru_cache
def get_w8a8_group_quant_config_filepath(M: int, GROUP_SIZE: int) -> str:
    """Return the path of the tuned-config JSON for this device and group size.

    ``M`` does not influence the path; it is accepted so the function can be
    called with the same key as the config lookup.
    """
    device_name = arch_info.get_device()
    # Collapse device-name variants onto the names used in the config files.
    if device_name.lower().startswith("bw"):
        device_name = "BW200"
    if "k100" in device_name.lower():
        device_name = "K100_AI"
    json_file_name = f"w8a8_per_token_group_quant_device_name={device_name},group_size={GROUP_SIZE}.json"
    config_file_path = os.path.join(
        f"{AITER_TRITON_CONFIGS_PATH}", "group_quant", json_file_name
    )
    return config_file_path


@functools.lru_cache
def get_w8a8_group_quant_configs(
    M: int, groupSize: int
) -> Optional[Dict[int, Any]]:
    """Return tuned launch configurations for the int8 group-quant kernel.

    The result maps an irregular grid of element counts to kernel launch
    configurations. To launch the kernel for a given element count, pick the
    entry whose key is closest to it.

    Returns:
        The mapping loaded from the JSON config file, or ``None`` (after
        logging a warning) when no config file exists for this device and
        group size.
    """
    config_file_path = get_w8a8_group_quant_config_filepath(M, groupSize)
    if os.path.exists(config_file_path):
        with open(config_file_path, encoding="utf-8") as f:
            # JSON object keys are strings; convert them back to ints.
            return {int(key): val for key, val in json.load(f).items()}

    # If no optimized configuration is available, we will use the default
    # configuration
    logger.warning(
        (
            "Using default W8A8 GROUP QUANT kernel config. Performance might "
            "be sub-optimal! Config file not found at %s"
        ),
        config_file_path,
    )
    return None


def per_token_group_quant_int8(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: torch.dtype = torch.int8,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Perform per-token-group quantization on an input tensor `x`.

    It converts the tensor values into signed int8 values and returns the
    quantized tensor along with the scaling factor used for quantization.

    Args:
        x: The input tensor with ndim >= 2.
        group_size: The group size used for quantization.
        eps: The minimum to avoid dividing zero.
        dtype: The dtype of output tensor. Note that only `torch.int8`
            is supported for now.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
        scaling factor for quantization.
    """
    assert (x.shape[-1] % group_size == 0
            ), "the last dimension of `x` cannot be divisible by `group_size`"
    assert x.is_contiguous(), "`x` is not contiguous"

    iinfo = torch.iinfo(dtype)
    int8_max = iinfo.max
    int8_min = iinfo.min

    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
    x_s = torch.empty(
        x.shape[:-1] + (x.shape[-1] // group_size, ),
        device=x.device,
        dtype=torch.float32,
    )

    M = x.numel()
    configs = get_w8a8_group_quant_configs(M, group_size)
    if configs:
        # Use the tuned entry whose key is closest to the element count.
        config = configs[min(configs, key=lambda m: abs(m - M))]
    else:
        config = {
            # A block must hold at least one whole group; a fixed 128 would
            # give the kernel zero groups per program when group_size > 128.
            "BLOCK_SIZE": max(128, group_size),
            "num_warps": 1,
            "num_stages": 1,
        }

    grid = lambda META: (
        triton.cdiv(M, META['BLOCK_SIZE']),
    )
    _per_token_group_quant_int8[grid](
        x,
        x_q,
        x_s,
        M,
        eps,
        int8_min=int8_min,
        int8_max=int8_max,
        GROUP_SIZE=group_size,
        **config
    )

    return x_q, x_s
@triton.jit
def tanh(x):
    # tanh(x) == 2 * sigmoid(2 * x) - 1, so it is expressed through
    # Triton's sigmoid.
    doubled = 2 * x
    return 2 * tl.sigmoid(doubled) - 1
+ cur_kv_head * stride_buf_kh + + offs_d[None, :] + ) + k = tl.load( + K_Buffer + offs_buf_k, + mask=(offs_n[:, None] < split_kv_end) & (mask_d[None, :]), + other=0.0, + ) + qk = tl.sum(q[None, :] * k, 1) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * tanh(qk / logit_cap) + + qk = tl.where(offs_n < split_kv_end, qk, float("-inf")) + + offs_buf_v = ( + kv_loc[:, None] * stride_buf_vbs + + cur_kv_head * stride_buf_vh + + offs_dv[None, :] + ) + v = tl.load( + V_Buffer + offs_buf_v, + mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]), + other=0.0, + ) + + n_e_max = tl.maximum(tl.max(qk, 0), e_max) + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max) + acc *= re_scale + acc += tl.sum(p[:, None] * v, 0) + + e_sum = e_sum * re_scale + tl.sum(p, 0) + e_max = n_e_max + + offs_mid_o = ( + cur_batch * stride_mid_ob + + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + + offs_dv + ) + + tl.store( + Att_Out + offs_mid_o, + acc / e_sum, + mask=(mask_dv), + ) + + offs_mid_o_1 = ( + cur_batch * stride_mid_ob + + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + ) // Lv + + tl.store( + Att_Lse + offs_mid_o_1, + e_max + tl.log(e_sum), + ) + + +def _decode_att_m_fwd( + q, + k_buffer, + v_buffer, + att_out, + att_lse, + kv_indptr, + kv_indices, + num_kv_splits, + max_kv_splits, + sm_scale, + logit_cap, +): + BLOCK = 64 + # [TODO] work around SGPR limit on MI3xx + if _is_hip: + BLOCK = 8 + MAX_KV_SPLITS = max_kv_splits + Lk = k_buffer.shape[-1] + Lv = v_buffer.shape[-1] + + batch, head_num = kv_indptr.shape[0] - 1, q.shape[1] + + grid = (batch, head_num, MAX_KV_SPLITS) + kv_group_num = q.shape[1] // k_buffer.shape[1] + + if kv_group_num == 1: + num_warps = 4 + else: + num_warps = 2 + if _is_hip: + num_warps = 1 + + BLOCK_DMODEL = triton.next_power_of_2(Lk) + BLOCK_DV = triton.next_power_of_2(Lv) + + _fwd_kernel_stage1[grid]( + q, + k_buffer, + v_buffer, + sm_scale, + kv_indptr, + kv_indices, + att_out, + att_lse, + num_kv_splits, + 
q.stride(0), + q.stride(1), + k_buffer.stride(0), + k_buffer.stride(1), + v_buffer.stride(0), + v_buffer.stride(1), + att_out.stride(0), + att_out.stride(1), + att_out.stride(2), + kv_group_num=kv_group_num, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + MIN_BLOCK_KV=_MIN_BLOCK_KV, + logit_cap=logit_cap, + num_warps=num_warps, + num_stages=2, + Lk=Lk, + Lv=Lv, + ) + + +@triton.jit +def _fwd_grouped_kernel_stage1( + Q, + K_Buffer, + V_Buffer, + sm_scale, + kv_indptr, + kv_indices, + Att_Out, + Att_Lse, + num_kv_splits, + stride_qbs, + stride_qh, + stride_buf_kbs, + stride_buf_kh, + stride_buf_vbs, + stride_buf_vh, + stride_mid_ob, + stride_mid_oh, + stride_mid_os, + kv_group_num: tl.constexpr, + q_head_num: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DPE: tl.constexpr, + BLOCK_DV: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_H: tl.constexpr, + MIN_BLOCK_KV: tl.constexpr, + logit_cap: tl.constexpr, + Lk: tl.constexpr, + Lv: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head_id = tl.program_id(1) + cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H) + split_kv_id = tl.program_id(2) + + if BLOCK_H < kv_group_num: + VALID_BLOCK_H: tl.constexpr = BLOCK_H + else: + VALID_BLOCK_H: tl.constexpr = kv_group_num + cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H) + mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H + mask_h = mask_h & (cur_head < q_head_num) + + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dv = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lk + mask_dv = offs_dv < Lv + + cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch) + cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx + kv_splits = tl.load(num_kv_splits + cur_batch) + + offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :] + + if BLOCK_DPE > 0: + offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE) + mask_dpe = offs_dpe < Lk + off_qpe = ( + cur_batch * stride_qbs + cur_head[:, None] * 
stride_qh + offs_dpe[None, :] + ) + + kv_len_per_split = ( + tl.cdiv(tl.cdiv(cur_batch_seq_len, kv_splits), MIN_BLOCK_KV) * MIN_BLOCK_KV + ) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len) + + e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf") + e_sum = tl.zeros([BLOCK_H], dtype=tl.float32) + acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32) + + if split_kv_end > split_kv_start: + q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_d[None, :]), other=0.0) + if BLOCK_DPE > 0: + qpe = tl.load( + Q + off_qpe, mask=(mask_h[:, None]) & (mask_dpe[None, :]), other=0.0 + ) + for start_n in range(split_kv_start, split_kv_end, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + kv_loc = tl.load( + kv_indices + cur_batch_kv_start_idx + offs_n, + mask=offs_n < split_kv_end, + other=0, + ) + offs_buf_k = ( + kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_d[:, None] + ) + k = tl.load( + K_Buffer + offs_buf_k, + mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]), + other=0.0, + ) + qk = tl.dot(q, k.to(q.dtype)) + if BLOCK_DPE > 0: + offs_buf_kpe = ( + kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_dpe[:, None] + ) + kpe = tl.load( + K_Buffer + offs_buf_kpe, + mask=(offs_n[None, :] < split_kv_end) & (mask_dpe[:, None]), + other=0.0, + ) + qk += tl.dot(qpe, kpe.to(qpe.dtype)) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * tanh(qk / logit_cap) + + qk = tl.where( + mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf") + ) + + offs_buf_v = ( + kv_loc[:, None] * stride_buf_vbs + + cur_kv_head * stride_buf_vh + + offs_dv[None, :] + ) + v = tl.load( + V_Buffer + offs_buf_v, + mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]), + other=0.0, + ) + + n_e_max = tl.maximum(tl.max(qk, 1), e_max) + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max[:, None]) + acc *= re_scale[:, None] + 
def create_tuple(k):
    """Turn a stringified tuple key back into a tuple.

    Keys such as ``"(8, 128, 'fp16')"`` are split on ``", "``. Entries that
    start with a quote character are taken as strings (quotes stripped);
    every other entry is evaluated as a Python expression.

    Args:
        k: The key to convert, normally a ``str``.

    Returns:
        A tuple of the parsed entries when `k` is wrapped in parentheses
        (``"()"`` gives the empty tuple). Any other value, including the
        empty string and keys with only one of the two parentheses, is
        returned unchanged.
    """
    # Only a key wrapped in BOTH parentheses is a tuple; the emptiness check
    # also keeps the indexing below from raising on "".
    if not k or k[0] != "(" or k[-1] != ")":
        return k

    inner = k[1:-1]
    if not inner:
        return ()

    parsed = []
    for entry in inner.split(", "):
        if entry[:1] in ("'", '"'):
            parsed.append(entry[1:-1])
        else:
            # NOTE(review): eval() executes arbitrary expressions. That is
            # only acceptable while the keys come from trusted config
            # files; consider ast.literal_eval if every entry is a literal.
            parsed.append(eval(entry))
    return tuple(parsed)
def _decode_grouped_att_m_fwd(
    q,
    k_buffer,
    v_buffer,
    att_out,
    att_lse,
    kv_indptr,
    kv_indices,
    num_kv_splits,
    max_kv_splits,
    sm_scale,
    logit_cap,
    config: Optional[dict[str, any]] = None,
):
    """Launch the grouped (GQA/MQA/MLA) stage-1 decode attention kernel.

    Head dimensions are read from the last axis of `k_buffer` (Lk) and
    `v_buffer` (Lv). For Lk == 576 or Lk == 288 the K dimension is split
    into a main block plus a separate BLOCK_DPE block (512 + 64 and
    256 + 32); otherwise it is padded to the next power of two with
    BLOCK_DPE == 0. Results are written by the kernel into `att_out` and
    `att_lse`; nothing is returned.

    Args:
        q: Query tensor; `q.shape[1]` is used as the number of query heads.
        k_buffer: Key cache; `k_buffer.shape[1]` is used as the number of
            KV heads.
        v_buffer: Value cache.
        att_out: Output buffer for the per-split attention results; its
            first three strides are passed to the kernel.
        att_lse: Output buffer for the per-split log-sum-exp values.
        kv_indptr: Per-batch offsets into `kv_indices`; the batch size is
            `kv_indptr.shape[0] - 1`.
        kv_indices: KV cache locations for every sequence.
        num_kv_splits: Per-batch number of KV splits.
        max_kv_splits: Size of the third grid dimension.
        sm_scale: Scale applied to the QK scores.
        logit_cap: Logit cap; the kernel applies it only when > 0.
        config: Optional kernel meta-parameters (must contain "BLOCK_H",
            which the launch grid reads). When omitted, a tuned config is
            looked up for float16 queries and the default config is used
            for every other dtype.
    """
    Lk = k_buffer.shape[-1]
    Lv = v_buffer.shape[-1]

    if Lk == 576:
        BLOCK_DMODEL = 512
        BLOCK_DPE = 64
    elif Lk == 288:
        BLOCK_DMODEL = 256
        BLOCK_DPE = 32
    else:
        BLOCK_DMODEL = triton.next_power_of_2(Lk)
        BLOCK_DPE = 0
    BLOCK_DV = triton.next_power_of_2(Lv)

    batch, head_num = kv_indptr.shape[0] - 1, q.shape[1]
    kv_group_num = q.shape[1] // k_buffer.shape[1]

    MAX_KV_SPLITS = max_kv_splits
    grid = lambda META: (
        batch,
        triton.cdiv(head_num, min(META["BLOCK_H"], kv_group_num)),
        MAX_KV_SPLITS,
    )

    # `path` names a saved-kernel cache entry. It must be defined even when
    # the caller supplies `config`, otherwise the has_kernel_cache() call
    # below fails with UnboundLocalError.
    path = None
    if config is None:
        if q.dtype == torch.float16:
            config, path = _get_stage1_config(kv_group_num, head_num, Lk, Lv)
        else:
            config, path = get_stage1_default_config(Lk), None
    assert config is not None, "ERROR: Not found config for _decode_grouped_att_m_fwd()"

    if has_kernel_cache(path):
        fn = functools.partial(
            triton.utils.run_saved_kernel,
            _fwd_grouped_kernel_stage1,
            path,
            grid=grid,
        )
    else:
        fn = _fwd_grouped_kernel_stage1[grid]

    # BLOCK_N, BLOCK_H, num_warps, num_stages and the other launch
    # meta-parameters are supplied through `config`.
    fn(
        q,
        k_buffer,
        v_buffer,
        sm_scale,
        kv_indptr,
        kv_indices,
        att_out,
        att_lse,
        num_kv_splits,
        q.stride(0),
        q.stride(1),
        k_buffer.stride(0),
        k_buffer.stride(1),
        v_buffer.stride(0),
        v_buffer.stride(1),
        att_out.stride(0),
        att_out.stride(1),
        att_out.stride(2),
        kv_group_num=kv_group_num,
        q_head_num=head_num,
        BLOCK_DMODEL=BLOCK_DMODEL,
        BLOCK_DPE=BLOCK_DPE,
        BLOCK_DV=BLOCK_DV,
        MIN_BLOCK_KV=_MIN_BLOCK_KV,
        logit_cap=logit_cap,
        Lk=Lk,
        Lv=Lv,
        **config,
    )
def _decode_softmax_reducev_fwd(
    logits,
    lse,
    q,
    o,
    v_buffer,
    kv_indptr,
    num_kv_splits,
    max_kv_splits,
    config: Optional[dict[str, any]] = None,
):
    """Launch the stage-2 kernel that merges the per-split partial results.

    One program is launched per (batch, head) pair, taken from
    `q.shape[0]` and `q.shape[1]`. The kernel writes the merged output
    into `o`; nothing is returned.

    Args:
        logits: Per-split partial attention outputs; its first three
            strides are passed to the kernel.
        lse: Per-split log-sum-exp values matching `logits`.
        q: Query tensor, used for the grid shape and the dtype check.
        o: Output tensor; its first two strides are passed to the kernel.
        v_buffer: Value cache; only its last dimension (Lv) is read.
        kv_indptr: Per-batch offsets the kernel uses to derive sequence
            lengths.
        num_kv_splits: Per-batch number of KV splits.
        max_kv_splits: Upper bound on the number of splits the kernel
            iterates over.
        config: Optional kernel meta-parameters. When omitted, a tuned
            config is looked up for float16 queries and the default
            config is used for every other dtype.
    """
    batch, head_num = q.shape[0], q.shape[1]
    Lv = v_buffer.shape[-1]
    BLOCK_DV = triton.next_power_of_2(Lv)

    MAX_KV_SPLITS = max_kv_splits

    # `path` names a saved-kernel cache entry. It must be defined even when
    # the caller supplies `config`, otherwise the has_kernel_cache() call
    # below fails with UnboundLocalError.
    path = None
    if config is None:
        if q.dtype == torch.float16:
            config, path = _get_stage2_config(MAX_KV_SPLITS, Lv)
        else:
            config, path = get_stage2_default_config(), None
    assert config is not None, "ERROR: Not found config for _decode_softmax_reducev_fwd()"

    grid = (batch, head_num)

    if has_kernel_cache(path):
        fn = functools.partial(
            triton.utils.run_saved_kernel,
            _fwd_kernel_stage2,
            path,
            grid=grid,
        )
    else:
        fn = _fwd_kernel_stage2[grid]

    # num_warps, num_stages and the other launch meta-parameters are
    # supplied through `config`.
    fn(
        logits,
        lse,
        o,
        kv_indptr,
        num_kv_splits,
        logits.stride(0),
        logits.stride(1),
        logits.stride(2),
        o.stride(0),
        o.stride(1),
        MAX_KV_SPLITS=MAX_KV_SPLITS,
        MIN_BLOCK_KV=_MIN_BLOCK_KV,
        BLOCK_DV=BLOCK_DV,
        Lv=Lv,
        **config,
    )
def decode_attention_fwd(
    q,
    k_buffer,
    v_buffer,
    o,
    kv_indptr,
    kv_indices,
    attn_logits,
    attn_lse,
    num_kv_splits,
    max_kv_splits,
    sm_scale,
    logit_cap=0.0,
):
    """Run decode attention, choosing the plain or the grouped code path.

    The ratio of query heads (`q.shape[1]`) to value heads
    (`v_buffer.shape[1]`) selects the implementation: a ratio of 1 uses
    `decode_attention_fwd_normal` (MHA), anything else uses
    `decode_attention_fwd_grouped` (GQA/MQA/MLA). Both receive exactly the
    arguments passed here and write their result into `o`.

    Asserts that `max_kv_splits` equals `attn_logits.shape[2]` and that the
    batch size of `q` fits into both `kv_indptr` and `attn_logits`.
    """
    assert max_kv_splits == attn_logits.shape[2]
    assert q.shape[0] <= kv_indptr.shape[0] - 1
    assert q.shape[0] <= attn_logits.shape[0]

    heads_per_kv_head = q.shape[1] // v_buffer.shape[1]

    if heads_per_kv_head == 1:
        # MHA
        attention_impl = decode_attention_fwd_normal
    else:
        # GQA/MQA/MLA
        attention_impl = decode_attention_fwd_grouped

    attention_impl(
        q,
        k_buffer,
        v_buffer,
        o,
        kv_indptr,
        kv_indices,
        attn_logits,
        attn_lse,
        num_kv_splits,
        max_kv_splits,
        sm_scale,
        logit_cap,
    )
def _get_fw_configs() -> List[triton.Config]:  # noqa: C901
    """Enumerate the candidate launch configs for the HSTU forward kernel.

    Every combination of BLOCK_M in (32, 64, 128), BLOCK_N in (32, 64),
    num_stages in (1, 2), num_warps in (4, 8) and matrix_instr_nonkdim in
    (16, 32) is produced, with BLOCK_M varying slowest and
    matrix_instr_nonkdim fastest. "waves_per_eu" is fixed at 0 and "kpack"
    at 2.

    Returns:
        A new list of 48 ``triton.Config`` objects.
    """
    return [
        triton.Config(
            {
                "BLOCK_M": block_m,
                "BLOCK_N": block_n,
                "matrix_instr_nonkdim": nonkdim,
                "waves_per_eu": 0,
                "kpack": 2,
            },
            num_stages=stages,
            num_warps=warps,
        )
        for block_m in (32, 64, 128)
        for block_n in (32, 64)
        for stages in (1, 2)
        for warps in (4, 8)
        for nonkdim in (16, 32)
    ]
K_block_ptr, + V_block_ptr, + n_targets, + alpha, + MAX_SEQ_LEN, + contextual_seq_len, + max_attn_len, + CAUSAL: tl.constexpr, + HAS_MULTIPLE_TARGETS: tl.constexpr, + HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr, + HAS_MAX_ATTN_LEN: tl.constexpr, + ALLOW_TF32: tl.constexpr, + BLOCK_N: tl.constexpr, +): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(K_block_ptr, boundary_check=(1,), padding_option="zero") + qk = tl.dot(q, k, allow_tf32=ALLOW_TF32) * alpha + invalid_mask = offs_m[:, None] == offs_n[None, :] + max_ids = seq_len + if HAS_CONTEXTUAL_SEQ_LEN: + offs_m = offs_m - contextual_seq_len + 1 + offs_m = tl.where( + offs_m > 0, + offs_m, + 0, + ) + offs_n = offs_n - contextual_seq_len + 1 + offs_n = tl.where( + offs_n > 0, + offs_n, + 0, + ) + max_ids = max_ids - contextual_seq_len + 1 + if HAS_MULTIPLE_TARGETS: + max_ids = max_ids - n_targets + offs_m = tl.where( + offs_m < max_ids, + offs_m, + max_ids, + ) + offs_n = tl.where( + offs_n < max_ids, + offs_n, + max_ids, + ) + offs_m_minus_n = offs_m[:, None] - offs_n[None, :] + if not CAUSAL: + offs_m_minus_n = tl.where(offs_m_minus_n > 0, offs_m_minus_n, -offs_m_minus_n) + invalid_mask = invalid_mask | (offs_m_minus_n > 0) + if HAS_MAX_ATTN_LEN: + invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len + if HAS_CONTEXTUAL_SEQ_LEN: + invalid_mask = invalid_mask or ( + offs_m[:, None] == 0 and offs_n[None, :] < max_ids + ) + # pyre-fixme[16]: Module `math` has no attribute `fast_dividef`. 
+ silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * (1.0 / MAX_SEQ_LEN) + silu = tl.where(invalid_mask, silu, 0) + v = tl.load(V_block_ptr, boundary_check=(0,), padding_option="zero") + silu = silu.to(v.dtype) + return tl.dot(silu, v, allow_tf32=ALLOW_TF32) + + +@triton.jit +def _hstu_attn_fwd_compute( # noqa C901 + Q, + K, + V, + seq_offsets, + num_targets, + Out, + stride_qm, + stride_qh, + stride_kn, + stride_kh, + stride_vn, + stride_vh, + stride_om, + stride_oh, + alpha, + MAX_SEQ_LEN, + DeltaSize, + contextual_seq_len, + max_attn_len, + off_z, + off_h, + pid, + CAUSAL: tl.constexpr, + HAS_MULTIPLE_TARGETS: tl.constexpr, + IS_DELTA_Q: tl.constexpr, + ALLOW_TF32: tl.constexpr, + BLOCK_D_Q: tl.constexpr, + BLOCK_D_V: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr, + HAS_MAX_ATTN_LEN: tl.constexpr, +): + seq_start = tl.load(seq_offsets + off_z).to(tl.int64) + off_h = off_h.to(tl.int64) + off_z = off_z.to(tl.int64) + seq_end = tl.load(seq_offsets + off_z + 1) + seq_len = (seq_end - seq_start).to(tl.int32) + if IS_DELTA_Q: + start_m_delta = pid * BLOCK_M + start_m = (start_m_delta + seq_len - DeltaSize).to(tl.int32) + else: + start_m_delta = 0 + start_m = pid * BLOCK_M + if start_m < seq_len: + if HAS_MULTIPLE_TARGETS: + n_targets = tl.load(num_targets + off_z).to(tl.int32) + else: + n_targets = None + + # initialize offsets + offs_m = start_m + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + if IS_DELTA_Q: + Q_block_ptr = tl.make_block_ptr( + base=Q + off_h * stride_qh + off_z * DeltaSize * stride_qm, + shape=(DeltaSize, BLOCK_D_Q), + strides=(stride_qm, 1), + offsets=(start_m_delta, 0), + block_shape=(BLOCK_M, BLOCK_D_Q), + order=(1, 0), + ) + else: + Q_block_ptr = tl.make_block_ptr( + base=Q + off_h * stride_qh + seq_start * stride_qm, + shape=(seq_len, BLOCK_D_Q), + strides=(stride_qm, 1), + offsets=(start_m, 0), + block_shape=(BLOCK_M, BLOCK_D_Q), + order=(1, 0), + ) + K_block_ptr = 
tl.make_block_ptr( + base=K + off_h * stride_kh + seq_start * stride_kn, + shape=(BLOCK_D_Q, seq_len), + strides=(1, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_D_Q, BLOCK_N), + order=(0, 1), + ) + V_block_ptr = tl.make_block_ptr( + base=V + off_h * stride_vh + seq_start * stride_vn, + shape=(seq_len, BLOCK_D_V), + strides=(stride_vn, 1), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_D_V), + order=(1, 0), + ) + + q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option="zero") + acc = tl.zeros([BLOCK_M, BLOCK_D_V], dtype=tl.float32) + if CAUSAL: + if HAS_MULTIPLE_TARGETS: + uih_end = seq_len - n_targets + else: + uih_end = seq_len + if HAS_CONTEXTUAL_SEQ_LEN is True and start_m < contextual_seq_len: + # uih_end must be larger than start_m + low = 0 + high = seq_len + else: + low = 0 + high = start_m + BLOCK_M + if HAS_MAX_ATTN_LEN: + if start_m > uih_end: + low = uih_end - max_attn_len + else: + low = start_m - max_attn_len + if HAS_CONTEXTUAL_SEQ_LEN: + low = low if low > contextual_seq_len else 0 + else: + low = low if low > 0 else 0 + if HAS_MULTIPLE_TARGETS: + uih_end = (uih_end + BLOCK_N - 1) // BLOCK_N * BLOCK_N + if uih_end < start_m: + high = seq_len - n_targets + else: + low = 0 + high = seq_len + + if low > 0: + K_block_ptr = tl.advance(K_block_ptr, (0, low)) + V_block_ptr = tl.advance(V_block_ptr, (low, 0)) + end_n = low + for start_n in range(low, high, BLOCK_N): + acc += _hstu_attn_fwd_one_block( + start_n=start_n, + seq_len=seq_len, + offs_m=offs_m, + offs_n=offs_n + start_n, + q=q, + K_block_ptr=K_block_ptr, + V_block_ptr=V_block_ptr, + n_targets=n_targets if HAS_MULTIPLE_TARGETS else None, + alpha=alpha, + MAX_SEQ_LEN=MAX_SEQ_LEN, + contextual_seq_len=contextual_seq_len, + max_attn_len=max_attn_len, + CAUSAL=CAUSAL, + HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS, + HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN, + HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN, + ALLOW_TF32=ALLOW_TF32, + BLOCK_N=BLOCK_N, + ) + K_block_ptr = tl.advance(K_block_ptr, (0, 
BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + end_n += BLOCK_N + + if HAS_MULTIPLE_TARGETS and CAUSAL: + # pyre-ignore[61] + if uih_end < start_m: + low_delta = start_m + high_delta = start_m + BLOCK_M + offset = (low_delta - end_n).to(tl.int32) + K_block_ptr = tl.advance(K_block_ptr, (0, offset)) + V_block_ptr = tl.advance(V_block_ptr, (offset, 0)) + for start_delta in tl.range( + low_delta, high_delta, BLOCK_N, num_stages=0 + ): + acc += _hstu_attn_fwd_one_block( + start_n=start_delta, + seq_len=seq_len, + offs_m=offs_m, + offs_n=offs_n + start_delta, + q=q, + K_block_ptr=K_block_ptr, + V_block_ptr=V_block_ptr, + n_targets=n_targets if HAS_MULTIPLE_TARGETS else None, + alpha=alpha, + MAX_SEQ_LEN=MAX_SEQ_LEN, + contextual_seq_len=contextual_seq_len, + max_attn_len=max_attn_len, + CAUSAL=CAUSAL, + HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS, + HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN, + HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN, + ALLOW_TF32=ALLOW_TF32, + BLOCK_N=BLOCK_N, + ) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + + if IS_DELTA_Q: + start_m_delta = pid * BLOCK_M + offs_m_delta = start_m_delta + tl.arange(0, BLOCK_M) + offs_v_d = tl.arange(0, BLOCK_D_V) + off_o = Out + off_z * DeltaSize * stride_om + off_h * stride_oh + out_ptrs = off_o + offs_m_delta[:, None] * stride_om + offs_v_d[None, :] + tl.store(out_ptrs, acc, mask=(offs_m_delta < DeltaSize)[:, None]) + else: + # rematerialize offsets to save registers + start_m = pid * BLOCK_M + offs_m = start_m + tl.arange(0, BLOCK_M) + offs_v_d = tl.arange(0, BLOCK_D_V) + off_o = Out + seq_start * stride_om + off_h * stride_oh + out_ptrs = off_o + offs_m[:, None] * stride_om + offs_v_d[None, :] + tl.store(out_ptrs, acc, mask=(offs_m < seq_len)[:, None]) + + +@triton.jit +def _hstu_attn_fwd( # noqa C901 + Q, + K, + V, + sort_by_length_indices, + seq_offsets, + num_targets, + Out, + stride_qm, + stride_qh, + stride_kn, + stride_kh, + 
stride_vn, + stride_vh, + stride_om, + stride_oh, + alpha, + Z, + AUTOTUNE_Z, + H, + MAX_SEQ_LEN, + AUTOTUNE_MAX_SEQ_LEN, # Quantized MAX_SEQ_LEN used as an autotuning key + DimQ, + DimV, + DeltaSize, + contextual_seq_len, + max_attn_len, + CAUSAL: tl.constexpr, + HAS_MULTIPLE_TARGETS: tl.constexpr, + IS_DELTA_Q: tl.constexpr, + ALLOW_TF32: tl.constexpr, + BLOCK_D_Q: tl.constexpr, + BLOCK_D_V: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr, + HAS_MAX_ATTN_LEN: tl.constexpr, + HAS_SORT_BY_LENGTH_INDICES: tl.constexpr, +): + off_hz = tl.program_id(1) + off_z = off_hz // H + if HAS_SORT_BY_LENGTH_INDICES: + off_z = tl.load(sort_by_length_indices + off_z) + off_h = off_hz % H + pid = tl.program_id(0) + _hstu_attn_fwd_compute( + Q=Q, + K=K, + V=V, + seq_offsets=seq_offsets, + num_targets=num_targets, + Out=Out, + stride_qm=stride_qm, + stride_qh=stride_qh, + stride_kn=stride_kn, + stride_kh=stride_kh, + stride_vn=stride_vn, + stride_vh=stride_vh, + stride_om=stride_om, + stride_oh=stride_oh, + alpha=alpha, + MAX_SEQ_LEN=MAX_SEQ_LEN, + DeltaSize=DeltaSize, + contextual_seq_len=contextual_seq_len, + max_attn_len=max_attn_len, + off_z=off_z, + off_h=off_h, + pid=pid, + CAUSAL=CAUSAL, + HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS, + IS_DELTA_Q=IS_DELTA_Q, + ALLOW_TF32=ALLOW_TF32, + BLOCK_D_Q=BLOCK_D_Q, + BLOCK_D_V=BLOCK_D_V, + HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN, + HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + ) + + +@triton.jit +def _hstu_attn_bwd_one_block( # noqa C901 + start_m, + offs_n, + offs_m, + q_ptrs_trans, + dq_ptrs_trans, + mask_n, + do_ptrs, + dk, + dv, + k, + v, + pos_offs_n, + seq_len, + n_targets, + max_ids, + contextual_seq_len, + max_attn_len, + LOCK, + stride_qm, + stride_dom, + stride_dqm, + alpha, + MAX_SEQ_LEN, + CAUSAL: tl.constexpr, + HAS_MULTIPLE_TARGETS: tl.constexpr, + HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr, + HAS_MAX_ATTN_LEN: tl.constexpr, + ALLOW_TF32: 
tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + ATOMIC_ADD: tl.constexpr, +): + pos_offs_m = offs_m + start_m + mask_m = pos_offs_m < seq_len + invalid_mask_trans = pos_offs_m[None, :] == offs_n[:, None] + # recompute qk and silu + if HAS_CONTEXTUAL_SEQ_LEN: + pos_offs_m = pos_offs_m - contextual_seq_len + 1 + pos_offs_m = tl.where( + pos_offs_m > 0, + pos_offs_m, + 0, + ) + if HAS_MULTIPLE_TARGETS: + pos_offs_m = tl.where( + pos_offs_m < max_ids, + pos_offs_m, + max_ids, + ) + q_trans = tl.load( + q_ptrs_trans + start_m * stride_qm, + mask=mask_m[None, :], + other=0.0, + ) + qk_trans = tl.dot(k, q_trans, allow_tf32=ALLOW_TF32) * alpha + # pyre-fixme[16]: Module `math` has no attribute `fast_dividef`. + sig_trans = fast_dividef(1.0, 1.0 + tl.exp(-qk_trans)) + silu_trans = qk_trans * sig_trans * (1.0 / MAX_SEQ_LEN) + pos_offs_m_minus_n = pos_offs_m[None, :] - pos_offs_n[:, None] + if not CAUSAL: + pos_offs_m_minus_n = tl.where( + pos_offs_m_minus_n > 0, pos_offs_m_minus_n, -pos_offs_m_minus_n + ) + invalid_mask_trans = invalid_mask_trans | (pos_offs_m_minus_n > 0) + if HAS_MAX_ATTN_LEN: + invalid_mask_trans = invalid_mask_trans and pos_offs_m_minus_n <= max_attn_len + if HAS_CONTEXTUAL_SEQ_LEN: + invalid_mask_trans = invalid_mask_trans or ( + pos_offs_m[None, :] == 0 and pos_offs_n[:, None] < max_ids + ) + silu_trans = tl.where(invalid_mask_trans, silu_trans, 0) + silu_trans = silu_trans.to(k.dtype) + # compute dv + do = tl.load( + do_ptrs + start_m * stride_dom, + mask=mask_m[:, None], + other=0.0, + ) + dv += tl.dot(silu_trans, do, allow_tf32=ALLOW_TF32) + + # compute dk and dq + dqk_trans = tl.dot(v, tl.trans(do), allow_tf32=ALLOW_TF32) + dqk_trans = ( + dqk_trans * sig_trans * (1 + qk_trans * (1 - sig_trans)) * (1.0 / MAX_SEQ_LEN) + ) + dqk_trans = tl.where(invalid_mask_trans, dqk_trans, 0) + dqk_trans = dqk_trans.to(k.dtype) + + # Note: the factor `alpha` is delayed until the end of the function to reduce the cost + dk += tl.dot(dqk_trans, 
@triton.jit
def _hstu_attn_bwd_one_col_block(  # noqa C901
    start_n,
    seq_len,
    n_targets,
    contextual_seq_len,
    max_attn_len,
    Q,
    K,
    V,
    DOut,
    DQ,
    DK,
    DV,
    LOCK,
    stride_qm,
    stride_kn,
    stride_vn,
    stride_dom,
    stride_dqm,
    stride_dkn,
    stride_dvn,
    alpha,
    MAX_SEQ_LEN,
    CAUSAL: tl.constexpr,
    HAS_MULTIPLE_TARGETS: tl.constexpr,
    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
    HAS_MAX_ATTN_LEN: tl.constexpr,
    ALLOW_TF32: tl.constexpr,
    BLOCK_D_Q: tl.constexpr,
    BLOCK_D_V: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    UNROLL: tl.constexpr,
    ATOMIC_ADD: tl.constexpr,
):
    # Computes dk / dv for the BLOCK_N key/value rows starting at `start_n`
    # by looping over query-row blocks and delegating each tile to
    # `_hstu_attn_bwd_one_block`, which also accumulates dq into DQ.
    # The finished dk (scaled by alpha) and dv tiles are stored to DK / DV.
    #
    # Work on the subsequence dv[start_n, start_n + BLOCK_N, :]
    # [low, high) is the range of query rows visited by the main loop below.
    if CAUSAL:
        if HAS_MULTIPLE_TARGETS:
            low = start_n
            if HAS_MAX_ATTN_LEN:
                high = start_n + max_attn_len + BLOCK_N
                # Fall back to the full sequence once the window would reach
                # into the last `n_targets` rows.
                high = high if high + n_targets < seq_len else seq_len
            else:
                high = seq_len
        else:
            low = start_n
            if HAS_MAX_ATTN_LEN:
                high = start_n + max_attn_len + BLOCK_N
                high = high if high < seq_len else seq_len
            else:
                high = seq_len
        if HAS_CONTEXTUAL_SEQ_LEN:
            # Rows below the BLOCK_M-aligned end of the contextual prefix are
            # handled by the dedicated loop further down, so skip them here.
            contextual_block_end = tl.cdiv(contextual_seq_len, BLOCK_M) * BLOCK_M
            if low < contextual_block_end:
                low = contextual_block_end
    else:
        low = 0
        high = start_n + BLOCK_N

    # initialize row/col offsets
    offs_m = tl.arange(0, BLOCK_M)
    offs_qk_d = tl.arange(0, BLOCK_D_Q)
    offs_v_d = tl.arange(0, BLOCK_D_V)
    offs_n = start_n + tl.arange(0, BLOCK_N)

    # initialize pointers to value-like data
    # The feature-dimension offsets are added without a stride, i.e. with an
    # implicit element stride of 1 along the last dimension.
    q_ptrs_trans = Q + (offs_m[None, :] * stride_qm + offs_qk_d[:, None])
    dq_ptrs_trans = DQ + (offs_m[None, :] * stride_dqm + offs_qk_d[:, None])
    k_ptrs = K + (offs_n[:, None] * stride_kn + offs_qk_d[None, :])
    v_ptrs = V + (offs_n[:, None] * stride_vn + offs_v_d[None, :])
    mask_n = offs_n < seq_len

    do_ptrs = DOut + (offs_m[:, None] * stride_dom + offs_v_d[None, :])
    # initialize dv and dk
    dv = tl.zeros([BLOCK_N, BLOCK_D_V], dtype=tl.float32)
    dk = tl.zeros([BLOCK_N, BLOCK_D_Q], dtype=tl.float32)
    # k and v stay in SRAM throughout
    k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0)
    v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0)
    # `max_ids` is the upper clamp for positions; `pos_offs_n` holds the key
    # positions after the contextual shift / target clamp below.
    max_ids = seq_len
    if HAS_CONTEXTUAL_SEQ_LEN:
        pos_offs_n = offs_n - contextual_seq_len + 1
        pos_offs_n = tl.where(
            pos_offs_n > 0,
            pos_offs_n,
            0,
        )
        max_ids = max_ids - contextual_seq_len + 1
    else:
        pos_offs_n = offs_n
    if HAS_MULTIPLE_TARGETS:
        max_ids = max_ids - n_targets
        pos_offs_n = tl.where(
            pos_offs_n < max_ids,
            pos_offs_n,
            max_ids,
        )
    # loop over rows
    if HAS_CONTEXTUAL_SEQ_LEN and CAUSAL:
        # Query blocks covering the contextual prefix [0, contextual_seq_len).
        for start_m in range(0, contextual_seq_len, BLOCK_M):
            start_m = tl.multiple_of(start_m, BLOCK_M)
            dk, dv = _hstu_attn_bwd_one_block(
                start_m=start_m,
                offs_n=offs_n,
                offs_m=offs_m,
                q_ptrs_trans=q_ptrs_trans,
                dq_ptrs_trans=dq_ptrs_trans,
                mask_n=mask_n,
                do_ptrs=do_ptrs,
                dk=dk,
                dv=dv,
                k=k,
                v=v,
                pos_offs_n=pos_offs_n,
                seq_len=seq_len,
                n_targets=n_targets,
                max_ids=max_ids,
                contextual_seq_len=contextual_seq_len,
                max_attn_len=max_attn_len,
                LOCK=LOCK,
                stride_qm=stride_qm,
                stride_dom=stride_dom,
                stride_dqm=stride_dqm,
                alpha=alpha,
                MAX_SEQ_LEN=MAX_SEQ_LEN,
                CAUSAL=CAUSAL,
                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
                ALLOW_TF32=ALLOW_TF32,
                BLOCK_M=BLOCK_M,
                BLOCK_N=BLOCK_N,
                ATOMIC_ADD=ATOMIC_ADD,
            )
    # Main sweep over the query blocks in [low, high).
    for start_m in tl.range(low, high, BLOCK_M, loop_unroll_factor=UNROLL):
        start_m = tl.multiple_of(start_m, BLOCK_M)
        dk, dv = _hstu_attn_bwd_one_block(
            start_m=start_m,
            offs_n=offs_n,
            offs_m=offs_m,
            q_ptrs_trans=q_ptrs_trans,
            dq_ptrs_trans=dq_ptrs_trans,
            mask_n=mask_n,
            do_ptrs=do_ptrs,
            dk=dk,
            dv=dv,
            k=k,
            v=v,
            pos_offs_n=pos_offs_n,
            seq_len=seq_len,
            n_targets=n_targets,
            max_ids=max_ids,
            contextual_seq_len=contextual_seq_len,
            max_attn_len=max_attn_len,
            LOCK=LOCK,
            stride_qm=stride_qm,
            stride_dom=stride_dom,
            stride_dqm=stride_dqm,
            alpha=alpha,
            MAX_SEQ_LEN=MAX_SEQ_LEN,
            CAUSAL=CAUSAL,
            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
            ALLOW_TF32=ALLOW_TF32,
            BLOCK_M=BLOCK_M,
            BLOCK_N=BLOCK_N,
            ATOMIC_ADD=ATOMIC_ADD,
        )
    # write-back
    dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_v_d[None, :])
    dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_qk_d[None, :])
    # `alpha` was deferred by the per-tile helper; apply it once to the whole
    # accumulated dk tile.
    dk = dk * alpha
    tl.store(dv_ptrs, dv.to(k.dtype), mask=mask_n[:, None])
    tl.store(dk_ptrs, dk.to(k.dtype), mask=mask_n[:, None])
def _bwd_pre_hook(nargs):
    """Reset the accumulation buffers before a backward-kernel launch.

    ``nargs`` is the kernel-argument mapping handed to a ``triton.Config``
    pre-hook. ``DQ`` is always zeroed; ``LOCK`` is zeroed only when the
    config's ``SEQUENCE_PARALLEL`` entry is exactly ``True``.
    """
    nargs["DQ"].zero_()
    sequence_parallel = nargs["SEQUENCE_PARALLEL"]
    if sequence_parallel is True:
        nargs["LOCK"].zero_()


def _get_bw_configs() -> List[triton.Config]:
    """Enumerate the backward-kernel tuning space.

    Produces the full cross product of block sizes, pipeline stages, warp
    counts, ``matrix_instr_nonkdim``, ``waves_per_eu`` and the
    sequence-parallel switch (2*2*2*2*2*3*2 = 192 configs), with
    ``SEQUENCE_PARALLEL`` varying fastest and ``BLOCK_M`` slowest.
    ``UNROLL`` is fixed at 1 and every config carries ``_bwd_pre_hook``.
    """
    return [
        triton.Config(
            {
                "BLOCK_M": block_m,
                "BLOCK_N": block_n,
                "matrix_instr_nonkdim": nonkdim,
                "waves_per_eu": waves,
                "SEQUENCE_PARALLEL": seq_parallel,
                "UNROLL": 1,
            },
            num_stages=stages,
            num_warps=warps,
            pre_hook=_bwd_pre_hook,
        )
        for block_m in (32, 64)
        for block_n in (32, 64)
        for stages in (1, 2)
        for warps in (4, 8)
        for nonkdim in (16, 32)
        for waves in (0, 2, 4)
        for seq_parallel in (True, False)
    ]
@triton.jit
def _hstu_attn_bwd(  # noqa C901
    Q,
    K,
    V,
    sort_by_length_indices,
    seq_offsets,
    num_targets,
    DOut,
    DQ,
    DK,
    DV,
    LOCK,
    stride_qm,
    stride_qh,
    stride_kn,
    stride_kh,
    stride_vn,
    stride_vh,
    stride_dom,
    stride_doh,
    stride_dqm,
    stride_dqh,
    stride_dkn,
    stride_dkh,
    stride_dvn,
    stride_dvh,
    alpha,
    contextual_seq_len,
    max_attn_len,
    Z,
    AUTOTUNE_Z,
    H,
    MAX_SEQ_LEN,
    AUTOTUNE_MAX_SEQ_LEN,  # Quantized MAX_SEQ_LEN used as an autotuning key
    DimQ,
    DimV,
    CAUSAL: tl.constexpr,
    HAS_MULTIPLE_TARGETS: tl.constexpr,
    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
    HAS_MAX_ATTN_LEN: tl.constexpr,
    ALLOW_TF32: tl.constexpr,
    BLOCK_D_Q: tl.constexpr,
    BLOCK_D_V: tl.constexpr,
    SEQUENCE_PARALLEL: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    UNROLL: tl.constexpr,
    HAS_SORT_BY_LENGTH_INDICES: tl.constexpr,
):
    # Backward entry kernel. Grid axis 0 enumerates (sequence, head) pairs;
    # grid axis 1 is read only when SEQUENCE_PARALLEL is set, in which case it
    # selects the key/value column block this program handles.
    #
    # NOTE(review): `Z`, `AUTOTUNE_Z`, `AUTOTUNE_MAX_SEQ_LEN`, `DimQ` and
    # `DimV` are not referenced in this body.
    off_hz = tl.program_id(0)
    off_z = off_hz // H
    if HAS_SORT_BY_LENGTH_INDICES:
        # Remap the sequence index through the supplied permutation.
        off_z = tl.load(sort_by_length_indices + off_z)
    off_h = off_hz % H
    # 64-bit head index / sequence start so the pointer offsets below are
    # computed in int64.
    off_h = off_h.to(tl.int64)
    seq_start = tl.load(seq_offsets + off_z).to(tl.int64)
    seq_end = tl.load(seq_offsets + off_z + 1)
    seq_len = (seq_end - seq_start).to(tl.int32)
    if HAS_MULTIPLE_TARGETS:
        n_targets = tl.load(num_targets + off_z).to(tl.int32)
    else:
        n_targets = None
    # offset pointers for batch/head
    Q = Q + seq_start * stride_qm + off_h * stride_qh
    K = K + seq_start * stride_kn + off_h * stride_kh
    V = V + seq_start * stride_vn + off_h * stride_vh
    DOut = DOut + seq_start * stride_dom + off_h * stride_doh
    DQ = DQ + seq_start * stride_dqm + off_h * stride_dqh
    DK = DK + seq_start * stride_dkn + off_h * stride_dkh
    DV = DV + seq_start * stride_dvn + off_h * stride_dvh
    if SEQUENCE_PARALLEL:
        # One column block per program; several programs may then update the
        # same dq rows, hence ATOMIC_ADD=True (lock-protected dq update).
        start_n = tl.program_id(1) * BLOCK_N
        if start_n >= seq_len:
            # This column block lies entirely past the end of the sequence.
            return
        _hstu_attn_bwd_one_col_block(
            start_n=start_n,
            seq_len=seq_len,
            n_targets=n_targets,
            contextual_seq_len=contextual_seq_len,
            max_attn_len=max_attn_len,
            Q=Q,
            K=K,
            V=V,
            DOut=DOut,
            DQ=DQ,
            DK=DK,
            DV=DV,
            LOCK=LOCK,
            stride_qm=stride_qm,
            stride_kn=stride_kn,
            stride_vn=stride_vn,
            stride_dom=stride_dom,
            stride_dqm=stride_dqm,
            stride_dkn=stride_dkn,
            stride_dvn=stride_dvn,
            alpha=alpha,
            MAX_SEQ_LEN=MAX_SEQ_LEN,
            CAUSAL=CAUSAL,
            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
            ALLOW_TF32=ALLOW_TF32,
            BLOCK_D_Q=BLOCK_D_Q,
            BLOCK_D_V=BLOCK_D_V,
            BLOCK_M=BLOCK_M,
            BLOCK_N=BLOCK_N,
            UNROLL=UNROLL,
            ATOMIC_ADD=True,
        )
    else:
        # A single program walks every column block of its (sequence, head)
        # pair in turn, so the dq update runs without the lock.
        for start_n in range(0, seq_len, BLOCK_N):
            _hstu_attn_bwd_one_col_block(
                start_n=start_n,
                seq_len=seq_len,
                n_targets=n_targets,
                contextual_seq_len=contextual_seq_len,
                max_attn_len=max_attn_len,
                Q=Q,
                K=K,
                V=V,
                DOut=DOut,
                DQ=DQ,
                DK=DK,
                DV=DV,
                LOCK=LOCK,
                stride_qm=stride_qm,
                stride_kn=stride_kn,
                stride_vn=stride_vn,
                stride_dom=stride_dom,
                stride_dqm=stride_dqm,
                stride_dkn=stride_dkn,
                stride_dvn=stride_dvn,
                alpha=alpha,
                MAX_SEQ_LEN=MAX_SEQ_LEN,
                CAUSAL=CAUSAL,
                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
                ALLOW_TF32=ALLOW_TF32,
                BLOCK_D_Q=BLOCK_D_Q,
                BLOCK_D_V=BLOCK_D_V,
                BLOCK_M=BLOCK_M,
                BLOCK_N=BLOCK_N,
                UNROLL=UNROLL,
                ATOMIC_ADD=False,
            )
@functools.lru_cache(maxsize=1024)
def _get_fwd_config(
    AUTOTUNE_Z: int,
    H: int,
    AUTOTUNE_MAX_SEQ_LEN: int,
    DimQ: int,
    DimV: int,
    DeltaSize: int,
    IS_DELTA_Q: bool,
):
    """Return the tuned forward-kernel config for a batch-size bucket.

    The per-device JSON table is read from disk on first use and then kept on
    the function object as ``_config_dict``. Only ``AUTOTUNE_Z`` selects the
    entry; the other arguments merely take part in the ``lru_cache`` key.
    """
    try:
        table = _get_fwd_config._config_dict
    except AttributeError:
        # First call: load the table for the current device and memoize it.
        dev = arch_info.get_device()
        json_path = f"{AITER_TRITON_CONFIGS_PATH}/hstu_attn/{dev}-HSTU_ATTN_FWD.json"
        with open(json_path, "r") as handle:
            table = json.load(handle)
        _get_fwd_config._config_dict = table

    if AUTOTUNE_Z < 512:
        return table["small_batch"]
    if AUTOTUNE_Z == 512:
        return table["batch_512"]
    return table["large_batch"]
def triton_hstu_attention_fwd(
    N: int,
    alpha: float,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_offsets: torch.Tensor,
    causal: bool,
    num_targets: Optional[torch.Tensor],
    max_attn_len: int,
    contextual_seq_len: int,
    sort_by_length_indices: Optional[torch.Tensor],
    config: Optional[dict] = None,
) -> torch.Tensor:
    """
    Run the HSTU attention forward pass, dot(silu(alpha * dot(q, trans(k))), v),
    on jagged (concatenated variable-length) inputs.

    Key parameters:
        - N: max sequence length
        - alpha: scale applied to the output of the first dot
        - q: tensor with shape (L, H, D); L is the sum of all sequence lengths
        - k: tensor with shape (L, H, D)
        - v: tensor with shape (L, H, D)
        - seq_offsets: tensor with shape (B + 1); consecutive entries delimit
          each sequence inside the jagged dimension
        - causal: whether to use a causal mask
        - num_targets: optional per-sequence number of targets
        - max_attn_len: max attention length; the limit is enabled only when > 0
        - contextual_seq_len: contextual sequence length; enabled only when > 0
        - sort_by_length_indices: optional indices of sequences sorted by length
        - config: optional tuning config for the kernel; when omitted it is
          looked up with `_get_fwd_config`

    Returns:
        - Y: output with the same shape and dtype as `v`. When L == 0 the
          freshly allocated (empty) tensor is returned without a launch.
    """
    num_seqs = seq_offsets.numel() - 1
    autotune_z = prev_power_of_2(num_seqs)
    total_len, num_heads, dim_q = q.shape
    _, _, dim_v = v.shape
    out = torch.empty_like(v)
    feature_flags = {
        "HAS_MULTIPLE_TARGETS": num_targets is not None,
        "HAS_CONTEXTUAL_SEQ_LEN": contextual_seq_len > 0,
        "HAS_MAX_ATTN_LEN": max_attn_len > 0,
        "HAS_SORT_BY_LENGTH_INDICES": sort_by_length_indices is not None,
    }
    if total_len == 0:
        return out

    quantized_seq_len = autotune_max_seq_len(N)
    # Delta-Q mode is not used by this entry point.
    delta_size = 0
    is_delta_q = False

    if config is None:
        config = _get_fwd_config(
            autotune_z,
            num_heads,
            quantized_seq_len,
            dim_q,
            dim_v,
            delta_size,
            is_delta_q,
        )

    def grid(meta):
        # axis 0: query blocks of the longest sequence; axis 1: (sequence, head)
        return (triton.cdiv(N, meta["BLOCK_M"]), num_seqs * num_heads)

    _hstu_attn_fwd[grid](
        Q=q,
        K=k,
        V=v,
        sort_by_length_indices=sort_by_length_indices,
        seq_offsets=seq_offsets,
        num_targets=num_targets,
        Out=out,
        stride_qm=q.stride(0),
        stride_qh=q.stride(1),
        stride_kn=k.stride(0),
        stride_kh=k.stride(1),
        stride_vn=v.stride(0),
        stride_vh=v.stride(1),
        stride_om=out.stride(0),
        stride_oh=out.stride(1),
        alpha=alpha,
        Z=num_seqs,
        AUTOTUNE_Z=autotune_z,
        H=num_heads,
        MAX_SEQ_LEN=N,
        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
        DimQ=dim_q,
        DimV=dim_v,
        DeltaSize=delta_size,
        contextual_seq_len=contextual_seq_len,
        max_attn_len=max_attn_len,
        CAUSAL=causal,
        IS_DELTA_Q=is_delta_q,
        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
        BLOCK_D_Q=dim_q,
        BLOCK_D_V=dim_v,
        **feature_flags,
        **config,
    )

    return out
@functools.lru_cache(maxsize=1024)
def _get_bwd_config(
    AUTOTUNE_Z: int,
    H: int,
    AUTOTUNE_MAX_SEQ_LEN: int,
    DimQ: int,
    DimV: int,
):
    """Return the tuned backward-kernel config for a batch-size bucket.

    The per-device JSON table is read from disk on first use and then kept on
    the function object as ``_config_dict``. Only ``AUTOTUNE_Z`` selects the
    entry; the other arguments merely take part in the ``lru_cache`` key.
    """
    try:
        table = _get_bwd_config._config_dict
    except AttributeError:
        # First call: load the table for the current device and memoize it.
        dev = arch_info.get_device()
        json_path = f"{AITER_TRITON_CONFIGS_PATH}/hstu_attn/{dev}-HSTU_ATTN_BWD.json"
        with open(json_path, "r") as handle:
            table = json.load(handle)
        _get_bwd_config._config_dict = table

    bucket = "small_batch" if AUTOTUNE_Z < 512 else "large_batch"
    return table[bucket]
def triton_hstu_attention_bwd(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    dq: torch.Tensor,
    dk: torch.Tensor,
    dv: torch.Tensor,
    seq_offsets: torch.Tensor,
    num_targets: Optional[torch.Tensor],
    N: int,
    alpha: float,
    max_attn_len: int,
    causal: bool,
    contextual_seq_len: int,
    sort_by_length_indices: Optional[torch.Tensor],
    config: Optional[dict] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Computes HSTU attention bwd pass.

    Key parameters:
        - dout: tensor with shape (L, H, D), gradient of the forward output
        - q: tensor with shape (L, H, D), L is the sum of lengths of all sequences
        - k: tensor with shape (L, H, D), L is the sum of lengths of all sequences
        - v: tensor with shape (L, H, D), L is the sum of lengths of all sequences
        - dq: tensor with shape (L, H, D), buffer receiving the gradient of q
          (zeroed here before the launch)
        - dk: tensor with shape (L, H, D), buffer receiving the gradient of k
        - dv: tensor with shape (L, H, D), buffer receiving the gradient of v
        - seq_offsets: tensor with shape (B + 1); consecutive entries delimit
          each sequence inside the jagged dimension
        - num_targets: optional per-sequence number of targets
        - N: max sequence length
        - alpha: scale parameter to multiply output of first dot
        - max_attn_len: max attention length; the limit is enabled only when > 0
        - causal: whether to use a causal mask
        - contextual_seq_len: contextual sequence length; enabled only when > 0
        - sort_by_length_indices: optional indices of sequences sorted by length
        - config: optional tuning config for the kernel; it must provide
          "SEQUENCE_PARALLEL" and "BLOCK_N" (both are read on the host side)

    Returns:
        - dq, dk, dv: gradients of q, k, and v. When `dout` has no rows, three
          freshly allocated zero tensors shaped like q, k and v are returned
          instead of the passed-in buffers.
    """
    dout = switch_to_contiguous_if_needed(dout)
    dq = switch_to_contiguous_if_needed(dq)
    dk = switch_to_contiguous_if_needed(dk)
    dv = switch_to_contiguous_if_needed(dv)
    if dout.shape[0] == 0:
        return torch.zeros_like(q), torch.zeros_like(k), torch.zeros_like(v)
    Z = seq_offsets.numel() - 1
    _, H, DimQ = q.shape
    _, _, DimV = v.shape

    max_seq_len = autotune_max_seq_len(N)
    AUTOTUNE_Z = prev_power_of_2(Z)
    if config is None:
        config = _get_bwd_config(AUTOTUNE_Z, H, max_seq_len, DimQ, DimV)

    # axis 0: one program per (sequence, head); axis 1: one program per
    # key/value column block, but only in sequence-parallel mode.
    grid = lambda meta: (  # noqa E731
        Z * H,
        (triton.cdiv(N, meta["BLOCK_N"]) if meta["SEQUENCE_PARALLEL"] else 1),
    )
    # Lower bound assumed for BLOCK_M when sizing the lock buffer. The kernel
    # indexes cdiv(N, BLOCK_M) locks per (sequence, head) program, so any
    # BLOCK_M >= MIN_BLOCK_M fits (the smallest BLOCK_M listed in
    # `_get_bw_configs` is 32).
    # TODO (linjianma): avoid hardcoding the value.
    MIN_BLOCK_M = 16
    lock = torch.empty(
        (Z * H, triton.cdiv(N, MIN_BLOCK_M)),
        dtype=torch.int32,
        device=q.device,
    )

    # dq is accumulated into by the kernel, so it must start from zero; the
    # locks are only used (and therefore only cleared) in sequence-parallel mode.
    dq.zero_()
    if config["SEQUENCE_PARALLEL"] == 1:
        lock.zero_()

    _hstu_attn_bwd[grid](
        Q=q,
        K=k,
        V=v,
        sort_by_length_indices=sort_by_length_indices,
        seq_offsets=seq_offsets,
        num_targets=num_targets,
        DOut=dout,
        DQ=dq,
        DK=dk,
        DV=dv,
        LOCK=lock,
        stride_qm=q.stride(0),
        stride_qh=q.stride(1),
        stride_kn=k.stride(0),
        stride_kh=k.stride(1),
        stride_vn=v.stride(0),
        stride_vh=v.stride(1),
        stride_dom=dout.stride(0),
        stride_doh=dout.stride(1),
        stride_dqm=dq.stride(0),
        stride_dqh=dq.stride(1),
        stride_dkn=dk.stride(0),
        stride_dkh=dk.stride(1),
        stride_dvn=dv.stride(0),
        stride_dvh=dv.stride(1),
        alpha=alpha,
        contextual_seq_len=contextual_seq_len,
        max_attn_len=max_attn_len,
        Z=Z,
        AUTOTUNE_Z=AUTOTUNE_Z,
        H=H,
        MAX_SEQ_LEN=N,
        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
        DimQ=DimQ,
        DimV=DimV,
        CAUSAL=causal,
        HAS_MULTIPLE_TARGETS=num_targets is not None,
        HAS_CONTEXTUAL_SEQ_LEN=contextual_seq_len > 0,
        HAS_MAX_ATTN_LEN=max_attn_len > 0,
        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
        BLOCK_D_Q=DimQ,
        BLOCK_D_V=DimV,
        HAS_SORT_BY_LENGTH_INDICES=sort_by_length_indices is not None,
        **config,
    )

    return dq, dk, dv
class _AttentionFunction(torch.autograd.Function):
    """Autograd binding between the Triton HSTU attention forward and backward passes."""

    @staticmethod
    # pyre-ignore[14]
    def forward(
        ctx,
        N: int,
        alpha: float,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        seq_offsets: torch.Tensor,
        causal: bool,
        num_targets: Optional[torch.Tensor],
        max_attn_len: int,
        contextual_seq_len: int,
        sort_by_length: bool,
    ) -> torch.Tensor:
        """Stash everything backward needs on ``ctx`` and run the forward kernel.

        When ``sort_by_length`` is set, sequences are ordered by decreasing
        length (derived from ``seq_offsets``) and that permutation is passed
        to the kernel and saved for backward.
        """
        if sort_by_length:
            lengths = seq_offsets[1:] - seq_offsets[:-1]
            sort_by_length_indices = torch.sort(
                lengths, descending=True, stable=False
            ).indices
        else:
            sort_by_length_indices = None

        # Saved layout: q, k, v, seq_offsets, then num_targets (if any), then
        # the sort permutation (if any). backward() relies on this order.
        optional_tensors = (num_targets, sort_by_length_indices)
        ctx.save_for_backward(
            q,
            k,
            v,
            seq_offsets,
            *[t for t in optional_tensors if t is not None],
        )
        ctx.alpha = alpha
        ctx.causal = causal
        ctx.has_multiple_targets = num_targets is not None
        ctx.max_attn_len = max_attn_len
        ctx.N = N
        ctx.contextual_seq_len = contextual_seq_len
        ctx.sort_by_length = sort_by_length
        return triton_hstu_attention_fwd(
            N=N,
            alpha=alpha,
            q=q,
            k=k,
            v=v,
            seq_offsets=seq_offsets,
            causal=causal,
            num_targets=num_targets,
            max_attn_len=max_attn_len,
            contextual_seq_len=contextual_seq_len,
            sort_by_length_indices=sort_by_length_indices,
        )

    @staticmethod
    # pyre-ignore[14]
    def backward(ctx, dout: torch.Tensor) -> Tuple[
        None,
        None,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        None,
        None,
        None,
        None,
        None,
        None,
    ]:
        """Return gradients for q, k and v; every other forward input gets ``None``."""
        with torch.inference_mode():
            saved = ctx.saved_tensors
            q, k, v, seq_offsets = saved[:4]
            # Optional tensors follow in the order forward() saved them.
            trailing = list(saved[4:])
            num_targets = trailing.pop(0) if ctx.has_multiple_targets else None
            sort_by_length_indices = trailing.pop(0) if ctx.sort_by_length else None

            dq, dk, dv = triton_hstu_attention_bwd(
                dout=dout,
                q=q,
                k=k,
                v=v,
                dq=torch.empty_like(q),
                dk=torch.empty_like(k),
                dv=torch.empty_like(v),
                seq_offsets=seq_offsets,
                num_targets=num_targets,
                N=ctx.N,
                alpha=ctx.alpha,
                max_attn_len=ctx.max_attn_len,
                causal=ctx.causal,
                contextual_seq_len=ctx.contextual_seq_len,
                sort_by_length_indices=sort_by_length_indices,
            )
            # One slot per forward() argument after ctx: N, alpha, q, k, v,
            # then the six non-differentiable trailing arguments.
            return (None, None, dq, dk, dv) + (None,) * 6
def persistent_lean_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    Mp: torch.Tensor,
    Lp: torch.Tensor,
    Op: torch.Tensor,  # (total_programs, n_ctx_q, d)
    locks: torch.Tensor,
    batch_num_block_n: torch.Tensor,
    total_programs: int,
    BLOCK_M: int,
    BLOCK_N: int,
    causal: bool,
    batch_size: int,
    sm_scale: float,
    num_warps: int,
    waves_per_eu: int,
):
    """Launch the persistent Lean Attention kernel and return its output.

    Tensors are taken in the flattened [B*Seqlen, H, d] layout.

    Parameters:
        - q, k, v: query / key / value tensors; their last dimensions must be
          equal and one of 16, 32, 64, 128 or 256
        - Mp, Lp, Op: scratch buffers for the per-program partial results
          (running max, running sum, and partial output of shape
          (total_programs, n_ctx_q, d))
        - locks: per-program flags used by the kernel to publish partial results
        - batch_num_block_n: per-request block counts read by the kernel
        - total_programs: requested number of persistent programs; the value
          actually launched is the one returned by
          `get_num_splits_and_buffer_sizes`
        - BLOCK_M, BLOCK_N: tile sizes; for `causal`, BLOCK_M must be a
          multiple of BLOCK_N
        - causal: whether to apply the causal mask
        - batch_size: number of requests packed into q's first dimension
        - sm_scale: softmax scale
        - num_warps, waves_per_eu: kernel launch options

    Returns:
        A tensor shaped like `q` with dtype `v.dtype`.

    Raises:
        AssertionError: on mismatched / unsupported head dimensions, or when
        `causal` is set and BLOCK_M is not a multiple of BLOCK_N.
    """
    # Local import: this module does not import logging at file level.
    import logging

    # shape constraints
    HEAD_DIM_Q, HEAD_DIM_K, HEAD_DIM_V = q.shape[-1], k.shape[-1], v.shape[-1]
    assert (
        HEAD_DIM_Q == HEAD_DIM_K and HEAD_DIM_K == HEAD_DIM_V
    ), "Incompatible Q/K/V Hidden Dimensions"
    assert HEAD_DIM_K in {16, 32, 64, 128, 256}

    # MASKED_BLOCKS is used for prefill/causal for BLOCK_M > BLOCK_N
    MASKED_BLOCKS = BLOCK_M // BLOCK_N

    if causal:
        # Only support BLOCK_M is multiple of BLOCK_N
        # TODO: add other scenarios
        assert BLOCK_M % BLOCK_N == 0

    N_CTX_Q = q.shape[0] // batch_size
    N_CTX_K = k.shape[0]  # This is the sum of all ctx_n in a batch
    H = q.shape[1]

    # 1.44269504 ~= log2(e): fold the base change into the scale because the
    # kernel exponentiates with exp2.
    qk_scale = sm_scale * 1.44269504

    (
        num_m_blocks,
        num_n_blocks,
        high_load_wgs,
        max_tiles_per_wg,
        tiles_per_head,
        total_programs,
        num_splits,
        even_split,
    ) = get_num_splits_and_buffer_sizes(
        causal,
        batch_size,
        N_CTX_Q,
        N_CTX_K,
        H,
        H,
        BLOCK_M,
        BLOCK_N,
        total_programs,
    )

    grid = (total_programs, 1, 1)

    o = torch.empty_like(q, dtype=v.dtype)

    la_kernel = la_persistent[grid](
        False,
        0,
        q,
        k,
        v,
        qk_scale,
        Mp,
        Lp,
        Op,
        o,
        batch_num_block_n,
        locks,
        q.stride(0),  # N_CTX_Q
        q.stride(1),  # H
        q.stride(2),  # Head_Dim
        k.stride(0),
        k.stride(1),
        k.stride(2),
        v.stride(0),
        v.stride(1),
        v.stride(2),
        o.stride(0),
        o.stride(1),
        o.stride(2),
        Op.stride(0),  # total_programs
        Op.stride(1),  # n_ctx_q
        Op.stride(2),  # head_dim
        HEAD_DIM=HEAD_DIM_K,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        MASKED_BLOCKS=MASKED_BLOCKS,
        batch_size=batch_size,
        causal=causal,
        num_m_blocks=num_m_blocks,
        num_n_blocks=num_n_blocks,
        # leanAttention params
        high_load_wgs=high_load_wgs,
        max_tiles_per_wg=max_tiles_per_wg,
        tiles_per_head=tiles_per_head,
        num_splits=num_splits,
        waves_per_eu=waves_per_eu,
        num_warps=num_warps,
        num_stages=1,
        num_ctas=1,
    )

    # Diagnostic only: report through the package logger instead of printing
    # to stdout on every call.
    logging.getLogger("aiter").debug(
        "la kernel %s registers used, %s spills",
        la_kernel.n_regs,
        la_kernel.n_spills,
    )

    return o
def get_num_splits_and_buffer_sizes(
    causal,
    batch_size,
    max_seqlen_q,
    max_seqlen_k,
    num_heads,
    num_heads_k,
    BLOCK_M,
    BLOCK_N,
    num_SMs,
):
    """Compute the Lean Attention (stream-K style) work partition.

    Based on onnxruntime/contrib_ops/cuda/bert/lean_attention.

    Parameters:
        - causal: whether the causal (prefill) tiling is used; forced to
          False when the effective query length is 1 (decode)
        - batch_size: number of requests in the batch
        - max_seqlen_q: query length per request
        - max_seqlen_k: total key length (sum over the batch)
        - num_heads, num_heads_k: query / key head counts
        - BLOCK_M, BLOCK_N: tile sizes along the query / key dimensions
        - num_SMs: number of persistent programs (CTAs) to launch

    Returns:
        A tuple ``(num_m_blocks, num_n_blocks, high_load_tbs,
        max_tiles_per_tb, tiles_per_head, lean_griddimz, num_splits,
        even_split)`` where
        - num_m_blocks: number of BLOCK_M tiles along the query length
        - num_n_blocks: number of BLOCK_N tiles along the key length, divided
          by batch_size
        - high_load_tbs: number of CTAs that process ``max_tiles_per_tb``
          tiles (the remaining ones process one tile fewer)
        - max_tiles_per_tb: maximum number of lean tiles per CTA
        - tiles_per_head: lean tiles per head
        - lean_griddimz: launch grid size (equal to ``num_SMs``)
        - num_splits: upper bound on the number of CTAs sharing one output tile
        - even_split: True when the tiles divide evenly across the CTAs
    """
    num_m_blocks = (max_seqlen_q + BLOCK_M - 1) // BLOCK_M
    num_n_blocks = (max_seqlen_k + BLOCK_N - 1) // BLOCK_N

    # TODO: Support Grouped-Query Attention
    max_seqlen_q = max_seqlen_q * num_heads // num_heads_k

    # A single query row is a decode step: no causal tiling needed.
    if max_seqlen_q == 1:
        causal = False

    if causal:
        # Prefill - Causal: query block i needs ceil((i + 1) * BLOCK_M / BLOCK_N)
        # key tiles. Does not support ragged batch for causal.
        tiles_per_head = batch_size * sum(
            ((i + 1) * BLOCK_M + BLOCK_N - 1) // BLOCK_N for i in range(num_m_blocks)
        )
    else:
        # Decode or Not Causal
        tiles_per_head = num_m_blocks * num_n_blocks

    total_tiles = tiles_per_head * num_heads_k  # Total tiles across all heads

    # StreamK Lean has as many threadblocks as SMs
    lean_griddimz = num_SMs  # CTA launch grid

    # Max number lean tiles per task block (CTA)
    max_tiles_per_tb = (total_tiles + lean_griddimz - 1) // lean_griddimz
    min_tiles_per_tb = max_tiles_per_tb - 1

    # Find max number of splits
    even_split = total_tiles % lean_griddimz == 0
    if even_split or min_tiles_per_tb == 0:
        # Every working CTA holds exactly max_tiles_per_tb tiles. This is also
        # the case when there are fewer tiles than CTAs (the low-load CTAs
        # then hold zero tiles), which previously divided by zero.
        num_splits = 1 + ((num_n_blocks + max_tiles_per_tb - 2) // max_tiles_per_tb)
    else:
        num_splits = 1 + ((num_n_blocks + max_tiles_per_tb - 3) // min_tiles_per_tb)

    # high_load_tbs is the remainder of total_tile / num_cta
    high_load_tbs = total_tiles - (min_tiles_per_tb * lean_griddimz)

    # Needed for causal. This is (per batch n_ctx) // BLOCK_N
    num_n_blocks = num_n_blocks // batch_size

    return (
        num_m_blocks,
        num_n_blocks,
        high_load_tbs,
        max_tiles_per_tb,
        tiles_per_head,
        lean_griddimz,
        num_splits,
        even_split,
    )
@triton.jit
def find_group(x, MASKED_BLOCKS):
    # Groups have growing sizes MASKED_BLOCKS, 2 * MASKED_BLOCKS, 3 * ... laid
    # out back to back. For a block index `x`, walk the groups until the next
    # one would no longer fit at or below `x`, and return:
    #   group_id     - index of the group reached,
    #   group_size   - number of blocks in that group,
    #   total_blocks - number of blocks in all groups before it.
    group_id = 0
    total_blocks = 0
    while total_blocks + (group_id + 1) * MASKED_BLOCKS <= x:
        total_blocks += (group_id + 1) * MASKED_BLOCKS
        group_id += 1
    group_size = (group_id + 1) * MASKED_BLOCKS
    return group_id, group_size, total_blocks


@triton.jit
def la_persistent(
    is_pod,
    pod_pid,
    Q,
    K,
    V,
    qk_scale,
    Mp,
    Lp,
    Op,
    Out,
    batch_num_block_n,
    locks,
    stride_qm,  # n_ctx_q
    stride_qh,  # Head
    stride_qk,  # head_dim
    stride_kn,
    stride_kh,
    stride_kk,
    stride_vn,
    stride_vh,
    stride_vk,
    stride_om,  # n_ctx_q
    stride_oh,  # Head
    stride_on,  # head_dim
    stride_oph,  # total_programs
    stride_opm,  # n_ctx_q
    stride_opn,  # head_dim
    HEAD_DIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    MASKED_BLOCKS: tl.constexpr,
    batch_size: tl.constexpr,
    causal: tl.constexpr,
    num_m_blocks: tl.constexpr,
    num_n_blocks: tl.constexpr,
    # leanAttention params
    high_load_wgs: tl.constexpr,
    max_tiles_per_wg: tl.constexpr,
    tiles_per_head: tl.constexpr,
    num_splits: tl.constexpr,
):
    # Persistent Lean Attention kernel. Each program (CTA) owns a contiguous
    # range of global "lean tile" ids [iter, cta_end_tile_gid) and walks it,
    # one output tile at a time. A CTA that starts an output tile is its
    # "host": it writes the final result, first merging the partial results
    # that later CTAs publish through Mp / Lp / Op and `locks`.
    #
    # The program id comes from `pod_pid` when `is_pod` is set, otherwise from
    # the launch grid.
    if is_pod:
        current_pid = pod_pid
    else:
        current_pid = tl.program_id(0)

    # The first `high_load_wgs` programs take max_tiles_per_wg tiles each, the
    # remaining ones take one tile fewer.
    if current_pid < high_load_wgs:
        iter = max_tiles_per_wg * current_pid
        cta_end_tile_gid = iter + max_tiles_per_wg
    else:
        iter = (max_tiles_per_wg - 1) * (
            current_pid - high_load_wgs
        ) + high_load_wgs * max_tiles_per_wg
        cta_end_tile_gid = iter + (max_tiles_per_wg - 1)

    # Loop context length
    while iter < cta_end_tile_gid:
        # Calculate index of current head output tile
        # The tiles_per_head is the sum of # BLOCK_N in K/V sequence of all batches
        tile_head_idx = iter // tiles_per_head

        # To generate an output tile, a loop over [tile_iter, tile_iter_end) lean tiles is needed
        # [tile_iter, tile_iter_end) are in the form of global tile id
        if causal:
            tile_batch_idx = (iter % tiles_per_head) // (tiles_per_head // batch_size)
            # Does not support ragged batching. All requests in the batch have the same context length (per_head_tile_size)
            # tiles_per_head: total sum of # BLOCK_N in K/V sequence of all batches
            # per_head_tile_size: per head # BLOCK_N of each output tile
            per_head_tile_idx, per_head_tile_size, total_blocks = find_group(
                iter
                - (tile_head_idx * tiles_per_head)
                - (tile_batch_idx * (tiles_per_head // batch_size)),
                MASKED_BLOCKS,
            )
            tile_iter = (
                tile_head_idx * tiles_per_head
                + (tile_batch_idx * (tiles_per_head // batch_size))
                + total_blocks
            )
            tile_iter_end = tile_iter + (per_head_tile_size)
            tile_idx = (
                tile_head_idx * batch_size + tile_batch_idx
            ) * num_m_blocks + per_head_tile_idx
        else:
            tile_idx = (
                tile_head_idx * batch_size
            )  # Output tile idx, 1 output tile per head per batch
            tile_iter = tile_head_idx * tiles_per_head
            if batch_size == 1:
                req_size = tiles_per_head
            else:
                req_size = tl.load(batch_num_block_n)
            tile_iter_end = tile_iter + req_size
            # Locate the request whose tile range contains `iter`.
            # NOTE(review): the comparisons below treat batch_num_block_n as a
            # running (cumulative) block count per request -- confirm with the
            # host-side caller.
            for b in range(1, batch_size):
                next_req_size = tl.load(batch_num_block_n + b)
                local_head_iter = iter % tiles_per_head
                if (local_head_iter < next_req_size) and (local_head_iter >= req_size):
                    tile_iter = tile_iter + req_size
                    tile_idx = tile_idx + b
                    tile_iter_end = tile_iter + (next_req_size - req_size)
                req_size = next_req_size
        # Local lean tile ID within a loop of an output tile
        local_iter = iter - tile_iter
        local_iter_end = tl.minimum(tile_iter_end, cta_end_tile_gid) - tile_iter

        # host_block: this CTA processes the first lean tile of the output tile
        if iter == tile_iter:
            host_block = True
        else:
            host_block = False
        # finishing_block: the output tile is finished within this block
        if cta_end_tile_gid >= tile_iter_end:
            finishing_block = True
        else:
            finishing_block = False

        offs_m = tl.arange(0, BLOCK_M)
        offs_n = tl.arange(0, BLOCK_N)
        offs_k = tl.arange(0, HEAD_DIM)

        # b_seq_size: number of BLOCK_N blocks that precede this request in K/V
        if causal:
            b_seq_size = tile_batch_idx * num_n_blocks
        else:
            tile_batch_idx = tile_idx % batch_size
            b_seq_size = 0
            if tile_batch_idx > 0:
                b_seq_size = tl.load(
                    batch_num_block_n + tile_batch_idx - 1
                )  # Previous batch size

        # K is addressed as [HEAD_DIM, BLOCK_N] (already transposed for q @ k),
        # V as [BLOCK_N, HEAD_DIM].
        k_offs = (
            (b_seq_size + local_iter) * BLOCK_N * stride_kn
            + tile_head_idx * stride_kh
            + offs_n[None, :] * stride_kn
            + offs_k[:, None] * stride_kk
        )
        v_offs = (
            (b_seq_size + local_iter) * BLOCK_N * stride_vn
            + tile_head_idx * stride_vh
            + offs_n[:, None] * stride_vn
            + offs_k[None, :] * stride_vk
        )

        k_ptrs = K + k_offs
        k_ptrs = tl.multiple_of(k_ptrs, (16, 1))
        v_ptrs = V + v_offs
        v_ptrs = tl.multiple_of(v_ptrs, (1, 16))

        if causal:
            q_idx = per_head_tile_idx + tile_batch_idx * num_m_blocks
        else:
            q_idx = tile_batch_idx
        q_offs = (
            q_idx * BLOCK_M * stride_qm
            + tile_head_idx * stride_qh
            + offs_m[:, None] * stride_qm
            + offs_k[None, :] * stride_qk
        )
        q_ptrs = Q + q_offs
        q_ptrs = tl.multiple_of(q_ptrs, (1, 16))

        # Online-softmax state: running row max (m_i), running row sum (l_i)
        # and the unnormalized output accumulator (acc).
        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
        l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
        acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)

        q = tl.load(q_ptrs)

        for l_iter in range(local_iter, local_iter_end):
            # -- compute qk ----
            # k = tl.load(k_ptrs, cache_modifier=".cg")
            k = tl.load(k_ptrs)
            qk = tl.dot(q, k)
            qk = qk * qk_scale

            # if ((iter + (l_iter - local_iter)) == (tile_iter_end - 1)) and causal:
            #   mask = offs_m[:, None] >= offs_n[None, :]
            # Apply the causal mask
            #   qk = tl.where(mask, qk, float("-inf"))
            if causal and (MASKED_BLOCKS > 1):
                # The last two lean tiles of the output tile get masked.
                if l_iter == (tile_iter_end - tile_iter) - 2:
                    mask = offs_m[:, None] >= offs_n[None, :]
                    qk = tl.where(mask, qk, float("-inf"))
                if l_iter == (tile_iter_end - tile_iter) - 1:
                    mask = (offs_m[:, None] >= BLOCK_N) & (
                        offs_n[None, :] <= (offs_m[:, None] - BLOCK_N)
                    )
                    qk = tl.where(mask, qk, float("-inf"))

            if causal and (MASKED_BLOCKS == 1):
                # Only the last lean tile of the output tile gets masked.
                # if (l_iter == (tile_iter_end - tile_iter) - 1):
                if (iter + (l_iter - local_iter)) == (tile_iter_end - 1):
                    mask = offs_m[:, None] >= offs_n[None, :]
                    qk = tl.where(mask, qk, float("-inf"))

            m_ij = tl.maximum(m_i, tl.max(qk, 1))
            qk = qk - m_ij[:, None]
            p = tl.math.exp2(qk)  # p.shape = [BLOCK_M, BLOCK_N]
            # -- update output accumulator --
            alpha = tl.math.exp2(m_i - m_ij)
            acc = (
                acc * alpha[:, None]
            )  # Scale each row of acc by the corresponding elements in alpha
            # v = tl.load(v_ptrs, cache_modifier=".cg")  # v.shape = [BLOCK_N, HEAD_DIM]
            v = tl.load(v_ptrs)
            acc += tl.dot(p.to(v.dtype), v)  # acc.shape = [BLOCK_M, HEAD_DIM]
            # -- update l_i
            l_ij = tl.sum(p, 1)  # rowsum(p)
            l_i = l_i * alpha + l_ij
            # update m_i
            m_i = m_ij.to(m_i.dtype)

            if (
                (l_iter == (tile_iter_end - tile_iter) - 1)
                and (iter == tile_iter_end - 1)
                and (MASKED_BLOCKS == 2)
            ):
                # This CTA processed only the last lean tile of the output
                # tile: reset the state of the rows with offs_m < BLOCK_N
                # (all of which are masked in that tile) to the neutral
                # values m=-inf, l=1, acc=0.
                mask1 = offs_m >= BLOCK_N
                m_i = tl.where(mask1, m_i, float("-inf"))
                l_i = tl.where(mask1, l_i, 1.0)
                mask1 = mask1[:, None]
                acc = tl.where(mask1, acc, 0.0)

            # update k/v pointer
            v_ptrs += BLOCK_N * stride_vn
            k_ptrs += BLOCK_N * stride_kn

        # initialize pointer to m and l
        m_cta = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
        l_cta = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0

        # lean output tile epilogue
        if not host_block:
            # Update pointers of partial results Mp[cta], Lp[cta], Op[cta]
            mp_ptrs = Mp + current_pid * BLOCK_M + offs_m
            lp_ptrs = Lp + current_pid * BLOCK_M + offs_m
            op_ptrs = (
                Op
                + current_pid * stride_oph  # stride_oph is total_program dimension
                + offs_m[:, None] * stride_opm
                + offs_k[None, :] * stride_opn
            )

            tl.store(mp_ptrs, m_i, cache_modifier=".wt")
            tl.store(lp_ptrs, l_i, cache_modifier=".wt")
            tl.store(op_ptrs, acc, cache_modifier=".wt")
            tl.debug_barrier()
            # tl.store(locks + current_pid, 1, cache_modifier=".wt")
            # According to streamK gemm, store + cache_modifier won't work universally
            # atomic_xchg is better solution but a less performant variant
            # Publish: flag this program's partial result as ready for the host.
            tl.atomic_xchg(locks + current_pid, 1)

        if host_block:  # and finishing_block:
            # A host block that is also a finishing block completes all the LeanTile iterations for its output tile
            # in a single CTA and so can directly store its results from LeanTile() in global memory without any reduction
            # Split acc into its first and second HEAD_DIM // 2 columns
            # (acc0 / acc1), matching o_ptrs0 / o_ptrs1 below.
            acc_reshaped = tl.reshape(acc, (BLOCK_M, 2, HEAD_DIM // 2))
            acc_permuted = tl.permute(acc_reshaped, (0, 2, 1))
            acc0, acc1 = tl.split(acc_permuted)

            o_h_offs = (
                q_idx * BLOCK_M * stride_om
                + tile_head_idx * stride_oh
                + offs_m[:, None] * stride_om
                + offs_k[None, :] * stride_on
            )
            # NOTE(review): o_ptrs is not referenced again; the stores below
            # use o_ptrs0 / o_ptrs1.
            o_ptrs = Out + o_h_offs

            if not finishing_block:
                # if host not finishing_block: # another CTA is processing the end of the output tile and store partial results

                # Replay the tile distribution to find `last_cta`, one past the
                # last program that contributes to this output tile.
                last_cta = current_pid + 1
                temp_end_gid = cta_end_tile_gid
                split = 1
                while (split < num_splits) and (temp_end_gid < tile_iter_end):
                    if last_cta < high_load_wgs:
                        if (tile_iter_end - temp_end_gid) < max_tiles_per_wg:
                            temp_end_gid += tile_iter_end - temp_end_gid
                        else:
                            temp_end_gid += max_tiles_per_wg
                    else:
                        if (tile_iter_end - temp_end_gid) < (max_tiles_per_wg - 1):
                            temp_end_gid += tile_iter_end - temp_end_gid
                        else:
                            temp_end_gid += max_tiles_per_wg - 1

                    last_cta += 1
                    split += 1
                # Next, load nonHost partial result
                for cta in range((current_pid + 1), last_cta):
                    # According to streamK gemm, atomic_cas is universal solution but less performant
                    # Spin until program `cta` has set its flag to 1 (the CAS
                    # with cmp=1, val=1 leaves the flag unchanged).
                    while tl.atomic_cas(locks + cta, 1, 1) != 1:
                        # while tl.load(locks + cta, cache_modifier=".cv", volatile=True) != 1:
                        pass

                    # Partial results are stored in [nonHost, Host-nonFinishing] layout
                    offs_mplp = cta * BLOCK_M + offs_m
                    mp_ptrs = Mp + offs_mplp
                    lp_ptrs = Lp + offs_mplp
                    op_ptrs0 = (
                        Op
                        + cta * stride_oph
                        + offs_m[:, None] * stride_opm
                        + tl.arange(0, HEAD_DIM // 2)[None, :] * stride_opn
                    )
                    op_ptrs1 = (
                        Op
                        + cta * stride_oph
                        + offs_m[:, None] * stride_opm
                        + (tl.arange(0, HEAD_DIM // 2)[None, :] + HEAD_DIM // 2)
                        * stride_opn
                    )

                    m_cta = tl.load(mp_ptrs)
                    l_cta = tl.load(lp_ptrs)
                    acc_cta0 = tl.load(op_ptrs0)
                    acc_cta1 = tl.load(op_ptrs1)

                    # m_i is the host CTA's m, m_cta is other nonHost CTA's m
                    # Merge the two partial softmax states by rescaling both
                    # to the common maximum m_new.
                    m_new = tl.maximum(m_cta, m_i)
                    alpha = tl.math.exp2(m_cta - m_new)
                    alpha1 = tl.math.exp2(m_i - m_new)
                    l_new = alpha * l_cta + alpha1 * l_i
                    acc0 = acc_cta0 * alpha[:, None] + acc0 * alpha1[:, None]
                    acc1 = acc_cta1 * alpha[:, None] + acc1 * alpha1[:, None]
                    # update m, l
                    m_i = m_new
                    l_i = l_new
            # host CTA write final result to memory
            o_ptrs0 = (
                Out
                + q_idx * BLOCK_M * stride_om
                + tile_head_idx * stride_oh
                + offs_m[:, None] * stride_om
                + tl.arange(0, HEAD_DIM // 2)[None, :] * stride_on
            )
            o_ptrs1 = (
                Out
                + q_idx * BLOCK_M * stride_om
                + tile_head_idx * stride_oh
                + offs_m[:, None] * stride_om
                + (tl.arange(0, HEAD_DIM // 2)[None, :] + HEAD_DIM // 2) * stride_on
            )

            # Final softmax normalization by the accumulated row sums.
            acc0 = acc0 / l_i[:, None]
            acc1 = acc1 / l_i[:, None]
            tl.store(o_ptrs0, acc0.to(Out.type.element_ty))
            tl.store(o_ptrs1, acc1.to(Out.type.element_ty))

        # update iter
        # Advance past the lean tiles consumed for this output tile.
        iter = iter + (local_iter_end - local_iter)
def persistent_lean_attention_paged(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    kv_block_tables: torch.Tensor,
    Mp: torch.Tensor,
    Lp: torch.Tensor,
    Op: torch.Tensor,
    locks: torch.Tensor,
    batch_num_block_n: torch.Tensor,
    total_programs: int,
    BLOCK_M: int,
    BLOCK_N: int,
    batch_size: int,
    sm_scale: float,
    num_warps: int,
    waves_per_eu: int,
):
    """Launch the persistent Lean Attention + Paged Attention decode kernel.

    The number of heads is read from ``q.shape[0]``, the per-request query
    length is ``q.shape[1] // batch_size`` and ``k.shape[1]`` is the KV
    length summed over the whole batch.

    Args:
        q, k, v: Attention inputs. Their last dimensions must be equal and
            one of 16, 32, 64, 128 or 256.
        kv_block_tables: KV block ids, read one entry per lean tile by the
            kernel.
        Mp, Lp, Op: Scratch buffers through which programs exchange partial
            row maxima, row sums and output accumulators.
        locks: Per-program flags; a program sets its entry to 1 after
            storing its partial results.
            NOTE(review): the kernel never clears these flags, so the caller
            presumably has to zero them before every launch -- confirm.
        batch_num_block_n: Per-request KV block counts read by the kernel
            (looks cumulative over the batch -- TODO confirm against caller).
        total_programs: Number of programs (grid size) to launch.
        BLOCK_M, BLOCK_N: Tile sizes along the query and KV axes.
        batch_size: Number of requests packed into ``q``.
        sm_scale: Softmax scale. It is multiplied by log2(e) because the
            kernel exponentiates with ``exp2``.
        num_warps: Triton ``num_warps`` launch option.
        waves_per_eu: Triton ``waves_per_eu`` launch option.

    Returns:
        A new tensor shaped like ``q`` with ``v``'s dtype holding the
        attention output.

    Raises:
        AssertionError: If the head dimensions of ``q``/``k``/``v`` differ or
            are not one of the supported sizes.
    """
    # shape constraints
    HEAD_DIM_Q, HEAD_DIM_K, HEAD_DIM_V = q.shape[-1], k.shape[-1], v.shape[-1]
    assert (
        HEAD_DIM_Q == HEAD_DIM_K and HEAD_DIM_K == HEAD_DIM_V
    ), "Incompatible Q/K/V Hidden Dimensions"
    assert HEAD_DIM_K in {16, 32, 64, 128, 256}

    N_CTX_Q = q.shape[1] // batch_size
    N_CTX_K = k.shape[1]  # This is the sum of all ctx_n in a batch
    H = q.shape[0]

    # Fold log2(e) into the scale so the kernel can use exp2 instead of exp.
    qk_scale = sm_scale * 1.44269504

    # The same head count is passed for Q and KV (no GQA support yet).
    (
        num_m_blocks,
        high_load_wgs,
        max_tiles_per_wg,
        tiles_per_head,
        total_programs,
        num_splits,
        _even_split,
    ) = get_num_splits_and_buffer_sizes(
        N_CTX_Q, N_CTX_K, H, H, HEAD_DIM_Q, BLOCK_M, BLOCK_N, total_programs
    )

    # Number of BLOCK_N-sized blocks needed to cover the KV axis (ceil-div).
    kv_shape = k.shape[1] // BLOCK_N + (1 if k.shape[1] % BLOCK_N != 0 else 0)

    grid = (total_programs, 1, 1)

    o = torch.empty_like(q, dtype=v.dtype)

    la_persistent_paged[grid](
        q,
        k,
        v,
        qk_scale,
        Mp,
        Lp,
        Op,
        o,
        kv_block_tables,
        kv_shape,
        batch_num_block_n,
        locks,
        q.stride(0),
        q.stride(1),
        q.stride(2),
        k.stride(0),
        k.stride(1),
        k.stride(2),
        v.stride(0),
        v.stride(1),
        v.stride(2),
        o.stride(0),
        o.stride(1),
        o.stride(2),
        Op.stride(0),
        Op.stride(1),
        Op.stride(2),
        HEAD_DIM=HEAD_DIM_K,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        batch_size=batch_size,
        num_m_blocks=num_m_blocks,
        # leanAttention params
        high_load_wgs=high_load_wgs,
        max_tiles_per_wg=max_tiles_per_wg,
        tiles_per_head=tiles_per_head,
        num_splits=num_splits,
        waves_per_eu=waves_per_eu,
        # Forward the caller's value; this used to pass waves_per_eu, which
        # silently ignored the num_warps argument.
        num_warps=num_warps,
    )

    return o


def get_num_splits_and_buffer_sizes(
    max_seqlen_q,
    max_seqlen_k,
    num_heads,
    num_heads_k,
    head_size,
    BLOCK_M,
    BLOCK_N,
    num_SMs,
):
    """Compute the stream-K work split used by the Lean Attention kernels.

    Based on onnxruntime/contrib_ops/cuda/bert/lean_attention.

    Args:
        max_seqlen_q: Query length covered by one head.
        max_seqlen_k: KV length covered by one head.
        num_heads: Accepted for signature compatibility; not used.
        num_heads_k: Number of KV heads; scales the total tile count.
        head_size: Accepted for signature compatibility; not used.
        BLOCK_M: Tile size along the query axis.
        BLOCK_N: Tile size along the KV axis.
        num_SMs: Number of programs (CTAs) in the launch grid.

    Returns:
        A 7-tuple ``(num_m_blocks, high_load_tbs, max_tiles_per_tb,
        tiles_per_head, lean_griddimz, num_splits, even_split)`` where

        * ``num_m_blocks`` is ``ceil(max_seqlen_q / BLOCK_M)``,
        * ``high_load_tbs`` is how many programs process
          ``max_tiles_per_tb`` tiles (the others process one fewer),
        * ``max_tiles_per_tb`` is ``ceil(total_tiles / num_SMs)``,
        * ``tiles_per_head`` is ``num_m_blocks * num_n_blocks``,
        * ``lean_griddimz`` is the grid size (``num_SMs``),
        * ``num_splits`` bounds how many programs may contribute to one
          output tile,
        * ``even_split`` is True when the tiles divide evenly over the grid.

    Raises:
        ZeroDivisionError: If ``num_SMs`` is zero or there are no tiles.
    """
    num_m_blocks = (max_seqlen_q + BLOCK_M - 1) // BLOCK_M
    num_n_blocks = (max_seqlen_k + BLOCK_N - 1) // BLOCK_N

    tiles_per_head = num_m_blocks * num_n_blocks
    total_tiles = tiles_per_head * num_heads_k  # Total tiles across all heads

    # StreamK Lean launches exactly as many programs as it was asked to.
    lean_griddimz = num_SMs  # CTA launch grid

    # Max number of lean tiles per program (CTA)
    max_tiles_per_tb = (total_tiles + lean_griddimz - 1) // lean_griddimz

    even_split = total_tiles % lean_griddimz == 0
    if even_split or max_tiles_per_tb == 1:
        # Every working program handles max_tiles_per_tb tiles. This branch
        # also covers the uneven case with fewer tiles than programs, where
        # the low-load programs get zero tiles and the uneven formula below
        # would divide by zero.
        num_splits = 1 + (
            (num_n_blocks + max_tiles_per_tb - 2) // max_tiles_per_tb
        )
    else:
        # Low-load programs handle one tile fewer, so more of them may be
        # needed to finish an output tile.
        num_splits = 1 + (
            (num_n_blocks + max_tiles_per_tb - 3) // (max_tiles_per_tb - 1)
        )

    # high_load_tbs is the remainder of total_tiles / num_cta
    high_load_tbs = total_tiles - ((max_tiles_per_tb - 1) * lean_griddimz)

    return (
        num_m_blocks,
        high_load_tbs,
        max_tiles_per_tb,
        tiles_per_head,
        lean_griddimz,
        num_splits,
        even_split,
    )
V, + qk_scale, + Mp, + Lp, + Op, + Out, + kv_block_tables, + kv_shape, + batch_num_block_n, + locks, + stride_qh, + stride_qm, + stride_qk, + stride_kh, + stride_kn, + stride_kk, + stride_vh, + stride_vn, + stride_vk, + stride_oh, + stride_om, + stride_on, + stride_oph, + stride_opm, + stride_opn, + HEAD_DIM: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + batch_size: tl.constexpr, + num_m_blocks: tl.constexpr, + # leanAttention params + high_load_wgs: tl.constexpr, + max_tiles_per_wg: tl.constexpr, + tiles_per_head: tl.constexpr, + num_splits: tl.constexpr, +): + current_pid = tl.program_id(0) + + if current_pid < high_load_wgs: + iter = max_tiles_per_wg * current_pid + cta_end_tile_gid = iter + max_tiles_per_wg + else: + iter = (max_tiles_per_wg - 1) * ( + current_pid - high_load_wgs + ) + high_load_wgs * max_tiles_per_wg + cta_end_tile_gid = iter + (max_tiles_per_wg - 1) + + # Loop context length + while iter < cta_end_tile_gid: + # Calculate index of current head output tile + # The tiles_per_head is the numner of BLOCK_N in the K/V sequence + tile_head_idx = iter // tiles_per_head + + # To generate an otuput tile, a loop over [tile_iter, tile_iter_end) lean tiles is needed + # [tile_iter, tile_iter_end) are in the form of global tile id + tile_idx = tile_head_idx * batch_size + tile_iter = tile_head_idx * tiles_per_head + if batch_size == 1: + req_size = tiles_per_head + else: + req_size = tl.load(batch_num_block_n) + tile_iter_end = tile_iter + req_size + for b in range(1, batch_size): + next_req_size = tl.load(batch_num_block_n + b) + local_head_iter = iter % tiles_per_head + if (local_head_iter < next_req_size) and (local_head_iter >= req_size): + tile_iter = tile_iter + req_size + tile_idx = tile_idx + b + tile_iter_end = tile_iter + (next_req_size - req_size) + req_size = next_req_size + # Local lean tile ID within a loop of an output tile + local_iter = iter - tile_iter + local_iter_end = tl.minimum(tile_iter_end, cta_end_tile_gid) - 
tile_iter + + if iter == tile_iter: + host_block = True + else: + host_block = False + # finishing_block: the output tile is finished within this block + if cta_end_tile_gid >= tile_iter_end: + finishing_block = True + else: + finishing_block = False + + KV_block_tables_ptr = kv_block_tables + iter + kv_offset = tile_head_idx * stride_kh + + K_base = K + kv_offset + V_base = V + kv_offset + + Q_base = Q + tile_idx * (stride_qh // batch_size) + + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 + acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32) + + acc, l_i, m_i = _attn_lean_tile( + acc, + l_i, + m_i, + Q_base, + stride_qm, + stride_qk, + kv_shape, + K_base, + V_base, + KV_block_tables_ptr, + stride_kn, + stride_kk, + stride_vn, + stride_vk, + qk_scale, + BLOCK_M, + BLOCK_N, + HEAD_DIM, + tile_idx, + local_iter, + local_iter_end, + ) + # initialize pointer to m and l + m_cta = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_cta = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 + acc_cta = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32) + + # lean output tile epilogue + offs_m = tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, HEAD_DIM) + + if not host_block: + # Update pointers of partial results M[cta], L[cta], O[cta] + mp_ptrs = Mp + current_pid * BLOCK_M + offs_m + lp_ptrs = Lp + current_pid * BLOCK_M + offs_m + op_ptrs = ( + Op + + current_pid * stride_oph + + offs_m[:, None] * stride_opm + + offs_k[None, :] * stride_opn + ) + + tl.store(mp_ptrs, m_i, cache_modifier=".wt") + tl.store(lp_ptrs, l_i, cache_modifier=".wt") + tl.store(op_ptrs, acc, cache_modifier=".wt") + tl.debug_barrier() + # According to streamK gemm, store + cache_modifier won't work universally + # atomic_xchg is better solution but a less performant variant + tl.atomic_xchg(locks + current_pid, 1) + + if host_block and finishing_block: + # A host block that is also a finishing block completes all the LeanTile iterations for 
its output tile + # in a single CTA and so can directly store its results from LeanTile() in global memory without any reduction + o_h_offs = Out + tile_idx * (stride_oh // batch_size) + o_ptrs = ( + o_h_offs + offs_m[:, None] * stride_om + offs_k[None, :] * stride_on + ) + acc = acc / l_i[:, None] + tl.store(o_ptrs, acc.to(Out.type.element_ty)) + + if host_block and not finishing_block: + # if not finishing_block: # another CTA is processing the end of the output tile and store partial results + o_h_offs = Out + tile_idx * (stride_oh // batch_size) + o_ptrs = ( + o_h_offs + offs_m[:, None] * stride_om + offs_k[None, :] * stride_on + ) + + last_cta = current_pid + 1 + temp_end_gid = cta_end_tile_gid + split = 1 + while (split < num_splits) and (temp_end_gid < tile_iter_end): + if last_cta < high_load_wgs: + if (tile_iter_end - temp_end_gid) < max_tiles_per_wg: + temp_end_gid += tile_iter_end - temp_end_gid + else: + temp_end_gid += max_tiles_per_wg + else: + if (tile_iter_end - temp_end_gid) < (max_tiles_per_wg - 1): + temp_end_gid += tile_iter_end - temp_end_gid + else: + temp_end_gid += max_tiles_per_wg - 1 + + last_cta += 1 + split += 1 + # Next, load nonHost partial restult + for cta in range((current_pid + 1), last_cta): + # According to treamK gemm, atomic_cas is universal solution but less performant + while tl.atomic_cas(locks + cta, 1, 1) != 1: + pass + + # Partial results are stored in [nonHost, Host-nonFinishing] layout + offs_mplp = cta * BLOCK_M + tl.arange(0, BLOCK_M) + mp_ptrs = Mp + offs_mplp + lp_ptrs = Lp + offs_mplp + op_h_offs = Op + cta * stride_oph + op_ptrs = ( + op_h_offs + + offs_m[:, None] * stride_opm + + offs_k[None, :] * stride_opn + ) + m_cta = tl.load(mp_ptrs) + l_cta = tl.load(lp_ptrs) + acc_cta = tl.load(op_ptrs) + + # m_i is the host CTA's m, m_cta is other nonHost CTA's m + m_new = tl.maximum(m_cta, m_i) + alpha = tl.math.exp2(m_cta - m_new) + alpha1 = tl.math.exp2(m_i - m_new) + l_new = alpha * l_cta + alpha1 * l_i + acc = 
acc_cta * alpha[:, None] + acc * alpha1[:, None] + # update m, l + m_i = m_new + l_i = l_new + # host non-finishing CTA write final result to memory + acc = acc / l_i[:, None] + tl.store(o_ptrs, acc.to(Out.type.element_ty)) + + # update iter + iter = iter + (local_iter_end - local_iter) + + +@triton.jit +def _attn_lean_tile( + acc, + l_i, + m_i, + Q_base, + stride_qm, + stride_qk, + kv_shape, + K_base, + V_base, + KV_block_tables_ptr, + stride_kn, + stride_kk, + stride_vn, + stride_vk, + qk_scale: tl.constexpr, # + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + HEAD_DIM: tl.constexpr, + tile_idx, + local_iter, + local_iter_end, +): # + Q_block_ptr = tl.make_block_ptr( + base=Q_base, + shape=(BLOCK_M, HEAD_DIM), + strides=(stride_qm, stride_qk), + offsets=(0, 0), + block_shape=(BLOCK_M, HEAD_DIM), + order=(1, 0), + ) + + q = tl.load(Q_block_ptr) + + K_block_ptr = tl.make_block_ptr( + base=K_base, + shape=(HEAD_DIM, kv_shape), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(HEAD_DIM, BLOCK_N), + order=(0, 1), # K parent tensor shape [Z, H, CTX, HEAD_DIM] + ) + V_block_ptr = tl.make_block_ptr( + base=V_base, + shape=(kv_shape, HEAD_DIM), + strides=(stride_vn, stride_vk), + offsets=(0, 0), + block_shape=(BLOCK_N, HEAD_DIM), + order=(1, 0), + ) + + for iter in range(local_iter, local_iter_end): + # update k/v pointer + kv_block_id = tl.load(KV_block_tables_ptr, cache_modifier=".cg") + V_bptr = tl.advance(V_block_ptr, (kv_block_id * BLOCK_N, 0)) + K_bptr = tl.advance(K_block_ptr, (0, kv_block_id * BLOCK_N)) + + # -- compute qk ---- + k = tl.load(K_bptr, cache_modifier=".cg") + qk = tl.dot(q, k) + qk = qk * qk_scale + + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + qk = qk - m_ij[:, None] + p = tl.math.exp2(qk) # p.shape = [BLOCK_M, BLOCK_N] + # -- update output accumulator -- + alpha = tl.math.exp2(m_i - m_ij) + acc = ( + acc * alpha[:, None] + ) # Scale each row of acc by the corresponding elements in alpha + v = tl.load(V_bptr, cache_modifier=".cg") # 
v.shape = [BLOCK_N, HEAD_DIM] + acc += tl.dot(p.to(v.dtype), v) # acc.shape = [BLOCK_M, HEAD_DIM] + # -- update l_i + l_ij = tl.sum(p, 1) # rowsum(p) + l_i = l_i * alpha + l_ij + # update m_i + m_i = m_ij.to(m_i.dtype) + + # update KV block tables pointer + KV_block_tables_ptr += 1 + + return acc, l_i, m_i diff --git a/aiter/ops/triton/mha.py b/aiter/ops/triton/mha.py new file mode 100644 index 0000000000000000000000000000000000000000..91625a4449611f1c2970429a19046df577c58e87 --- /dev/null +++ b/aiter/ops/triton/mha.py @@ -0,0 +1,2018 @@ +# SPDX-License-Identifier: MIT + +from typing import Optional, Tuple +import functools +import json +import torch +import triton +import triton.language as tl + +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH +from aiter.ops.triton.utils.pid_preprocessing import remap_xcd +from aiter.ops.triton.mha_onekernel_bwd import flash_attn_onekernel_backward +from aiter.ops.triton.mha_fused_bwd import flash_attn_fused_backward +from aiter.ops.triton.utils.mha_kernel_utils import ( + _compute_fp8_scaling_factors, + _is_fp8, +) + +global _USE_FUSED_BWD_KERNEL +_USE_FUSED_BWD_KERNEL = False + + +def mha_set_use_fused_bwd_kernel(value: bool): + global _USE_FUSED_BWD_KERNEL + _USE_FUSED_BWD_KERNEL = value + + +_USE_INT64_STRIDES = True + + +def mha_set_use_int64_strides(value: bool): + """Use 64-bit integer strides to prevent integer overflows with very large tensors.""" + global _USE_INT64_STRIDES + _USE_INT64_STRIDES = value + + +def _cast_to_fp8( + x: torch.Tensor, + fp8_dtype, + layout, + clamp_val=1e-9, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Convert a tensor to FP8 format, returning an FP8 tensor and a descale factor. 
+ Args: + - x (torch.Tensor): shape [batch, seq_len, heads, dim] + Returns: + - x_fp8 (torch.Tensor): FP8 tensor with the same shape as x + - descale_factor (torch.Tensor): tensor of shape [batch, 1, heads, 1] + """ + if len(x.shape) != 4: + raise ValueError( + f"'bshd' tensor should have shape [batch, seqlen, heads, dim], got {x.shape}" + ) + reduce_dims = (1, 3) # seq_len and dim dimensions + + # Compute the absolute max along reduce_dims, clamped to avoid 0-scale + x_abs_max = x.abs().amax(dim=reduce_dims) + x_abs_max = torch.maximum(x_abs_max, x.new_tensor(clamp_val)) + + # Unsqueeze back to a shape suitable for broadcast + unsqueeze_dims = sorted(reduce_dims) + for d in unsqueeze_dims: + x_abs_max = x_abs_max.unsqueeze(d) + + # compute scale and descale + fp8_max = torch.finfo(fp8_dtype).max + scale = fp8_max / x_abs_max + descale_factor = x_abs_max / fp8_max + + # cast to FP8, optionally setting requires_grad + x_fp8 = (x * scale).to(fp8_dtype) + + return x_fp8, descale_factor + + +def _cast_varlen_to_fp8( + x: torch.Tensor, + fp8_dtype: torch.dtype, + cu_seqlens, + clamp_val: float = 1e-9, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Convert a tensor of sequences with variable seq_len into fp8. 
+ Args: + - x (torch.Tensor): shape [total_seq_len, heads, dim] + Returns: + - x_fp8 (torch.Tensor): shape [total_seq_len, heads, dim] + - descale_factors (torch.Tensor): shape [batch, heads] + """ + # validate tensor shape + if len(x.shape) != 3: + raise ValueError( + f"tensor should have shape [total_seqlen, heads, dim], got {x.shape}" + ) + num_heads = x.shape[1] + + # Get batch size from cu_seqlens + batch = cu_seqlens.shape[0] - 1 + fp8_max = torch.finfo(fp8_dtype).max + + # Compute scale and descale factors per sequence + x_fp8 = torch.zeros_like(x, dtype=fp8_dtype) + descale_factors = torch.zeros( + (batch, num_heads), device=x.device, dtype=torch.float32 + ) + + for i in range(batch): + start = cu_seqlens[i] + end = cu_seqlens[i + 1] + x_slice = x[start:end] # Slice for current sequence + + # Standard tensor (0: seq_len, 2: head_dim) + x_abs_max = x_slice.abs().amax(dim=(0, 2)) # [heads] + + # apply minimum clamping + x_abs_max = torch.maximum(x_abs_max, x.new_tensor(clamp_val)) + + # compute scale and descale factors + scale_i = fp8_max / x_abs_max + descale_i = x_abs_max / fp8_max + + # store descale factors + descale_factors[i, :] = descale_i + + scale_reshape = scale_i.reshape(1, num_heads, 1) + + # scale and cast to FP8 + x_fp8[start:end] = (x_slice * scale_reshape).to(fp8_dtype) + + return x_fp8, descale_factors + + +@triton.jit +def _cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def _load_fn(ptrs, offset_first, offset_second, boundary_first, boundary_second): + if offset_first is not None and offset_second is not None: + mask = (offset_first[:, None] < boundary_first) & ( + offset_second[None, :] < boundary_second + ) + tensor = tl.load(ptrs, mask=mask, other=0.0) + elif offset_first is not None: + mask = offset_first[:, None] < boundary_first + tensor = tl.load(ptrs, mask=mask, other=0.0) + elif offset_second is not None: + mask = offset_second[None, :] < boundary_second + tensor = tl.load(ptrs, mask=mask, other=0.0) + else: + tensor = 
@triton.jit
def _compute_alibi_block(
    alibi_slope, seqlen_q, seqlen_k, offs_m, offs_n, transpose=False
):
    # Build the ALiBi bias for one block: -alibi_slope * |distance from the
    # bottom-right-aligned diagonal| for every (offs_m, offs_n) pair. The block
    # is returned transposed when `transpose` is set.
    #
    # when seqlen_k and seqlen_q are different we want the diagonal to stick to the bottom right of the attention matrix
    # for causal mask we want something like this where (1 is kept and 0 is masked)
    # seqlen_q = 2 and seqlen_k = 5
    #   1 1 1 1 0
    #   1 1 1 1 1
    # seqlen_q = 5 and seqlen_k = 2
    #        0 0
    #        0 0
    #        0 0
    #        1 0
    #        1 1
    # for alibi the diagonal is 0 indicating no penalty for attending to that spot and increasing penalty for attending further from the diagonal
    # e.g. alibi_slope = 1, seqlen_q = 2, seqlen_k = 5, offs_m = [0, 1], offs_n = [0, 1, 2, 3, 4], transpose = False
    # 1. offs_m[:,None] = [[0],
    #                       [1]]
    # 2. offs_m[:,None] + seqlen_k = [[5],
    #                                  [6]]
    # 3. offs_m[:,None] + seqlen_k - seqlen_q = [[3],
    #                                             [4]]
    # 4. offs_m[:,None] + seqlen_k - seqlen_q - offs_n[None,:] = [[3], - [[0, 1, 2, 3, 4]] =  [[ 3, 2, 1, 0,-1],
    #                                                            [4]]                           [ 4, 3, 2, 1, 0]]
    # 5. -1 * alibi_slope * tl.abs(relative_pos_block) = [[ -3, -2, -1, 0,-1],
    #                                                     [ -4, -3, -2, -1, 0]],
    relative_pos_block = offs_m[:, None] + seqlen_k - seqlen_q - offs_n[None, :]
    alibi_block = -1 * alibi_slope * tl.abs(relative_pos_block)
    if transpose:
        return alibi_block.T
    else:
        return alibi_block
+ mask = tl.full([BLOCK_M, BLOCK_N], True, dtype=tl.int1) + if MASK_STEPS: + # If this is the last block / iteration, we want to + # mask if the sequence length is not a multiple of block size + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not is_modulo_mn. + # last step might get wasted but that is okay. check if this masking works For + # that case. + + # remove the old if condition + # if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): + # Though this will unconditionally compute mask_partial at runtime, + # the causal for loop does not have the if-else block any more, which + # helps instruction scheduling and register pressure. + bound_cond = (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0) + boundary_m = tl.full([BLOCK_M], seqlen_k, dtype=tl.int32) + size_n = start_n + OFFS_N[None, :] + mask_partial = size_n < boundary_m[:, None] + mask = tl.where(bound_cond, mask_partial, mask) + + # compute masks + q_mask = OFFS_M[:, None] < seqlen_q + k_mask = (start_n + tl.arange(0, BLOCK_N))[None, :] < seqlen_k + p_mask = q_mask & k_mask + + # -- compute qk ---- + if IS_FP8: + qk += tl.dot(q, k) * descale_q * descale_k + else: + qk += tl.dot(q, k) + + if IS_CAUSAL: + causal_boundary = start_n + offs_n_causal + causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] + mask = mask and causal_mask + + qk = tl.where(mask, qk, float("-inf")) + + if alibi_slope is not None: + # Compute the global position of each token within the sequence + global_m_positions = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + global_n_positions = start_n + tl.arange(0, BLOCK_N) + alibi_block = _compute_alibi_block( + alibi_slope, seqlen_q, seqlen_k, global_m_positions, global_n_positions + ) + qk += alibi_block / SM_SCALE + # get max scores so far + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + m_ij_scaled = m_ij * SM_SCALE * RCP_LN2 + + # scale and subtract max + q_shifted = qk * SM_SCALE * RCP_LN2 - m_ij_scaled[:, None] + + # Compute scaled QK and softmax 
probabilities + p = tl.math.exp2(q_shifted) + + # CAVEAT: Must update l_ij before applying dropout + l_ij = tl.sum(p, 1) + if ENABLE_DROPOUT: + rng_output = tl.rand( + philox_seed, philox_ptrs + ) # TODO: use tl.randint for better performance + dropout_mask = rng_output > dropout_p + tl.store(dropout_mask_ptrs, dropout_mask, mask=p_mask) + + # return scores with negative values for dropped vals + sd_mask = tl.where(dropout_mask, p, -p) + tl.store(sd_mask_ptrs, sd_mask, mask=p_mask) + + # apply dropout mask in place + p = tl.where(dropout_mask, p, 0.0) + elif RETURN_SCORES: + # NOTE: the returned score is not the same as the reference because we need to adjust as we find new maxes per block. We are not doing that + tl.store(sd_mask_ptrs, p, mask=p_mask) + + # -- update output accumulator -- + # alpha is an adjustment factor for acc and li as we loop and find new maxes + # store the diff in maxes to adjust acc and li as we discover new maxes + m_diff_scaled = m_i * SM_SCALE * RCP_LN2 - m_ij_scaled + alpha = tl.math.exp2(m_diff_scaled) + acc = acc * alpha[:, None] + v = _load_fn(v_ptrs, k_offs_n, k_offs_k, seqlen_k, BLOCK_DMODEL) + # -- update m_i and l_i + l_i = l_i * alpha + l_ij + # update m_i and l_i + m_i = m_ij + + if IS_FP8: + scale_p, descale_p = _compute_fp8_scaling_factors(p, FP8_MAX) + acc += ( + tl.dot((p * scale_p).to(v.type.element_ty), v) * descale_p * descale_v + ) + else: + acc += tl.dot(p.to(v.type.element_ty), v) + + k_ptrs += BLOCK_N * stride_kn + v_ptrs += BLOCK_N * stride_vk + if RETURN_SCORES: + sd_mask_ptrs += BLOCK_N * stride_sn + + if ENABLE_DROPOUT: + dropout_mask_ptrs += BLOCK_N * stride_sn + philox_ptrs += BLOCK_N * stride_sn + + return acc, l_i, m_i + + +@triton.jit +def _attn_fwd( + q_ptr: torch.Tensor, + k_ptr: torch.Tensor, + v_ptr: torch.Tensor, + descale_q_ptr: torch.Tensor, + descale_k_ptr: torch.Tensor, + descale_v_ptr: torch.Tensor, + out_ptr: torch.Tensor, + alibi_slopes_ptr: torch.Tensor, + s_dmask_ptr: torch.Tensor, + 
dropout_mask_ptr: torch.Tensor, + softmax_lse_ptr: torch.Tensor, + stride_qz_in, + stride_qh_in, + stride_qm_in, + stride_qk_in, + stride_kz_in, + stride_kh_in, + stride_kn_in, + stride_kk_in, + stride_vz_in, + stride_vh_in, + stride_vn_in, + stride_vk_in, + stride_descale_q_z_in, + stride_descale_k_z_in, + stride_descale_v_z_in, + stride_oz_in, + stride_oh_in, + stride_om_in, + stride_on_in, + stride_alibi_z_in, + stride_alibi_h_in, + stride_sd_z_in, + stride_sd_h_in, + stride_sd_m_in, + stride_sd_n_in, + stride_lse_z_in, + stride_lse_h_in, + stride_lse_m_in, + sm_scale, + cu_seqlens_q, + cu_seqlens_k, + dropout_p, + philox_seed, + philox_offset_base_in, + SEQLEN_Q: tl.constexpr, + SEQLEN_K: tl.constexpr, + IS_CAUSAL: tl.constexpr, + NUM_Q_HEADS: tl.constexpr, + NUM_K_HEADS: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DMODEL_POW2: tl.constexpr, + RETURN_SCORES: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + IS_FP8: tl.constexpr, + FP8_MAX: tl.constexpr, + VARLEN: tl.constexpr, + BATCH, + NUM_XCD: tl.constexpr, + USE_INT64_STRIDES: tl.constexpr, +): + NUM_BLOCKS = (SEQLEN_Q + BLOCK_M - 1) // BLOCK_M + # calculate offsets + wid = tl.program_id( + 0 + ) # workgroup id ranging: 0,1,2,...., (BATCH * NUM_Q_HEADS * NUM_BLOCKS - 1) + # num blocks along seqlen + + off_q_head = wid % NUM_Q_HEADS + off_q_head = remap_xcd(off_q_head, NUM_Q_HEADS, NUM_XCD) + start_m = (wid // NUM_Q_HEADS) % NUM_BLOCKS + off_z = (wid // (NUM_BLOCKS * NUM_Q_HEADS)) % BATCH + + # offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL_POW2) + + # NOTE: + # Workaround for int64 strides, In the absence of strides being int64, parts of the offset + # computation is done in 32 bit and overflows resulting in segfaults + # If input strides are defined as int64, it disables vectorized loads which drops perf + # If we define new strides as stride_x = 
stride_x_in.to(tl.int64), that does not work + # because strides are tl.constexpr and cannot be upcasted + # If we define new strides as stride_x: tl.int64 = stride_x_in, segfault remains + # The permanent solution is to enable upcasting of tl.constexpr + # In the meantime, the following workaround provides correctness and does not drop perf + if USE_INT64_STRIDES: + stride_qz = tl.cast(stride_qz_in, tl.int64) + stride_qh = tl.cast(stride_qh_in, tl.int64) + stride_qm = tl.cast(stride_qm_in, tl.int64) + stride_qk = tl.cast(stride_qk_in, tl.int64) + stride_kz = tl.cast(stride_kz_in, tl.int64) + stride_kh = tl.cast(stride_kh_in, tl.int64) + stride_kn = tl.cast(stride_kn_in, tl.int64) + stride_kk = tl.cast(stride_kk_in, tl.int64) + stride_vz = tl.cast(stride_vz_in, tl.int64) + stride_vh = tl.cast(stride_vh_in, tl.int64) + stride_vn = tl.cast(stride_vn_in, tl.int64) + stride_vk = tl.cast(stride_vk_in, tl.int64) + if IS_FP8: + stride_descale_q_z = tl.cast(stride_descale_q_z_in, tl.int64) + stride_descale_k_z = tl.cast(stride_descale_k_z_in, tl.int64) + stride_descale_v_z = tl.cast(stride_descale_v_z_in, tl.int64) + stride_oz = tl.cast(stride_oz_in, tl.int64) + stride_oh = tl.cast(stride_oh_in, tl.int64) + stride_om = tl.cast(stride_om_in, tl.int64) + stride_on = tl.cast(stride_on_in, tl.int64) + stride_alibi_z = tl.cast(stride_alibi_z_in, tl.int64) + stride_alibi_h = tl.cast(stride_alibi_h_in, tl.int64) + + # NOTE: philox offset is need in dropout pointer calculations + philox_offset_base = tl.cast(philox_offset_base_in, tl.int64) + stride_sd_z = tl.cast(stride_sd_z_in, tl.int64) + stride_sd_h = tl.cast(stride_sd_h_in, tl.int64) + stride_sd_m = tl.cast(stride_sd_m_in, tl.int64) + stride_sd_n = tl.cast(stride_sd_n_in, tl.int64) + stride_lse_z = tl.cast(stride_lse_z_in, tl.int64) + stride_lse_h = tl.cast(stride_lse_h_in, tl.int64) + stride_lse_m = tl.cast(stride_lse_m_in, tl.int64) + else: + stride_qz = stride_qz_in + stride_qm = stride_qm_in + stride_qk = stride_qk_in + 
stride_qh = stride_qh_in + stride_kz = stride_kz_in + stride_kh = stride_kh_in + stride_kn = stride_kn_in + stride_kk = stride_kk_in + stride_vz = stride_vz_in + stride_vh = stride_vh_in + stride_vn = stride_vn_in + stride_vk = stride_vk_in + stride_descale_q_z = stride_descale_q_z_in + stride_descale_k_z = stride_descale_k_z_in + stride_descale_v_z = stride_descale_v_z_in + stride_oz = stride_oz_in + stride_oh = stride_oh_in + stride_om = stride_om_in + stride_on = stride_on_in + stride_alibi_z = stride_alibi_z_in + stride_alibi_h = stride_alibi_h_in + philox_offset_base = philox_offset_base_in + stride_sd_z = stride_sd_z_in + stride_sd_h = stride_sd_h_in + stride_sd_m = stride_sd_m_in + stride_sd_n = stride_sd_n_in + stride_lse_z = stride_lse_z_in + stride_lse_h = stride_lse_h_in + stride_lse_m = stride_lse_m_in + + if VARLEN: + cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) + cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) + + seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. + if start_m * BLOCK_M > seqlen_q: + return + cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) + cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) + seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start + else: + cu_seqlens_q_start = 0 + cu_seqlens_k_start = 0 + seqlen_q = SEQLEN_Q + seqlen_k = SEQLEN_K + + n_blocks = _cdiv_fn(seqlen_k, BLOCK_N) + + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + if IS_CAUSAL: + # If seqlen_q == seqlen_k, the attn scores are a square matrix. 
+ # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + + # This captures the decrease in n_blocks if we have a rectangular attn matrix + n_blocks_seqlen = _cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N + ) + + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + + # If we have no blocks after adjusting for seqlen deltas, this WG is part of + # the blocks that are all 0. We exit early. + if n_blocks <= 0: + offs_out = ( + off_z * stride_oz + + off_q_head * stride_oh + + cu_seqlens_q_start * stride_om + + offs_m[:, None] * stride_om + + offs_d[None, :] * stride_on + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_POW2], dtype=out_ptr.type.element_ty) + out_mask = (offs_m[:, None] < seqlen_q) & (offs_d[None, :] < BLOCK_DMODEL) + tl.store(out_ptr + offs_out, acc, mask=out_mask) + + if softmax_lse_ptr is not None: + offs_lse = ( + off_z * stride_lse_z + + off_q_head * stride_lse_h + + cu_seqlens_q_start * stride_lse_m + + offs_m * stride_lse_m + ) + lse_mask = offs_m < SEQLEN_Q + lse = tl.full([BLOCK_M], value=0.0, dtype=tl.float32) + tl.store(softmax_lse_ptr + offs_lse, lse, mask=lse_mask) + # TODO: Should dropout and return encoded softmax be handled here too? 
+ + return + + grp_sz: tl.constexpr = NUM_Q_HEADS // NUM_K_HEADS + if grp_sz != 1: # Grouped Query Attention + off_k_head = off_q_head // grp_sz + else: + off_k_head = off_q_head + + # q,k,v offsets + q_offs = ( + off_z * stride_qz + + off_q_head * stride_qh + + cu_seqlens_q_start * stride_qm + + offs_m[:, None] * stride_qm + + offs_d[None, :] * stride_qk + ) + q_ptrs = q_ptr + q_offs + + k_offs = ( + off_z * stride_kz + + off_k_head * stride_kh + + cu_seqlens_k_start * stride_kn + + offs_d[:, None] * stride_kk + + offs_n[None, :] * stride_kn + ) + k_ptrs = k_ptr + k_offs + + v_offs = ( + off_z * stride_vz + + off_k_head * stride_vh + + cu_seqlens_k_start * stride_vn + + offs_n[:, None] * stride_vn + + offs_d[None, :] * stride_vk + ) + v_ptrs = v_ptr + v_offs + + # alibi slopes + if alibi_slopes_ptr is not None: + alibi_offs = off_z * stride_alibi_z + off_q_head * stride_alibi_h + alibi_slope = tl.load(alibi_slopes_ptr + alibi_offs) + else: + alibi_slope = None + + # s_dmask (return_scores) + if s_dmask_ptr is not None: + s_dmask_offs = ( + off_z * stride_sd_z + + off_q_head * stride_sd_h + + offs_m[:, None] * stride_sd_m + + offs_n[None, :] * stride_sd_n + ) + s_dmask_ptrs = s_dmask_ptr + s_dmask_offs + else: + s_dmask_ptrs = None + + # dropout + if dropout_mask_ptr is not None: + dropout_mask_offs = ( + off_z * stride_sd_z + + off_q_head * stride_sd_h + + offs_m[:, None] * stride_sd_m + + offs_n[None, :] * stride_sd_n + ) + dropout_mask_ptrs = dropout_mask_ptr + dropout_mask_offs + philox_ptrs = ( + philox_offset_base + + off_z * stride_sd_z + + off_q_head * stride_sd_h + + offs_m[:, None] * stride_sd_m + + offs_n[None, :] * stride_sd_n + ) + else: + dropout_mask_ptrs = None + philox_ptrs = None + + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_POW2], dtype=tl.float32) + if BLOCK_DMODEL == BLOCK_DMODEL_POW2: + q_mask = offs_m[:, None] < seqlen_q + else: + q_mask 
= (offs_m[:, None] < seqlen_q) & (offs_d[None, :] < BLOCK_DMODEL) + q = tl.load(q_ptrs, mask=q_mask, other=0.0) + if IS_FP8: + descale_q = tl.load(descale_q_ptr + off_z * stride_descale_q_z + off_q_head) + descale_k = tl.load(descale_k_ptr + off_z * stride_descale_k_z + off_k_head) + descale_v = tl.load(descale_v_ptr + off_z * stride_descale_v_z + off_k_head) + else: + descale_q, descale_k, descale_v = 1.0, 1.0, 1.0 + + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + n_extra_tokens = seqlen_k % BLOCK_N + + # if CAUSAL, then determine masked_blocks and full blocks + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional block. + # In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its actual + # value because there is no masking. Similarly we do not need padding. 
+ if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + k_ptrs, + v_ptrs, + stride_kn, + stride_vn, + stride_sd_n, + start_m, + seqlen_k, + seqlen_q, + dropout_p, + s_dmask_ptrs, + dropout_mask_ptrs, + philox_seed, + philox_ptrs, + block_min, + block_max, + 0, + 0, + 0, + alibi_slope, + descale_q, + descale_k, + descale_v, + offs_m, + offs_n, + BLOCK_M, + BLOCK_N, + BLOCK_DMODEL, + BLOCK_DMODEL_POW2, + sm_scale, + False, + MASK_STEPS=False, + ENABLE_DROPOUT=ENABLE_DROPOUT, + RETURN_SCORES=RETURN_SCORES, + PADDED_HEAD=BLOCK_DMODEL != BLOCK_DMODEL_POW2, + IS_FP8=IS_FP8, + FP8_MAX=FP8_MAX, + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + # Remaining blocks, if any, are full / not masked. + if masked_blocks > 0: + if IS_CAUSAL: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) + else: + offs_n_causal = 0 + k_ptrs += n_full_blocks * BLOCK_N * stride_kn + v_ptrs += n_full_blocks * BLOCK_N * stride_vn + if RETURN_SCORES: + s_dmask_ptrs += n_full_blocks * BLOCK_N * stride_sd_n + if ENABLE_DROPOUT: + dropout_mask_ptrs += n_full_blocks * BLOCK_N * stride_sd_n + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + k_ptrs, + v_ptrs, + stride_kn, + stride_vn, + stride_sd_n, + start_m, + seqlen_k, + seqlen_q, + dropout_p, + s_dmask_ptrs, + dropout_mask_ptrs, + philox_seed, + philox_ptrs, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + alibi_slope, + descale_q, + descale_k, + descale_v, + offs_m, + offs_n, + BLOCK_M, + BLOCK_N, + BLOCK_DMODEL, + BLOCK_DMODEL_POW2, + sm_scale, + IS_CAUSAL, + MASK_STEPS=True, + ENABLE_DROPOUT=ENABLE_DROPOUT, + RETURN_SCORES=RETURN_SCORES, + PADDED_HEAD=BLOCK_DMODEL != BLOCK_DMODEL_POW2, + IS_FP8=IS_FP8, + FP8_MAX=FP8_MAX, + ) + # epilogue + # This helps the compiler do Newton Raphson on l_i vs on acc which is much larger. 
+ l_recip = 1 / l_i[:, None] + acc = acc * l_recip + if ENABLE_DROPOUT: + dropout_scale = 1 / (1 - dropout_p) + acc = acc * dropout_scale + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. + end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + if IS_CAUSAL: + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full( + (BLOCK_DMODEL_POW2,), causal_start_idx, dtype=tl.int32 + ) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :] + z = 0.0 + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + + # write back LSE(Log Sum Exponents), the log of the normalization constant + overflow_size = end_m_idx - seqlen_q + if softmax_lse_ptr is not None: + RCP_LN2: tl.constexpr = 1.4426950408889634 + LN2: tl.constexpr = 0.6931471824645996 + # compute log-sum-exp in base 2 units + # mi_base2 = m_i * RCP_LN2 + mi_base2 = m_i * RCP_LN2 * sm_scale + softmax_lse = mi_base2 + tl.math.log2(l_i) + # convert back to natural units + softmax_lse *= LN2 + + if IS_CAUSAL: + # zero out nans caused by -infs when doing causal + lse_causal_mask = (start_m_idx + tl.arange(0, BLOCK_M)) < causal_start_idx + softmax_lse = tl.where(lse_causal_mask, 0.0, softmax_lse) + + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last few rows. + # This is only true for the last M block. 
@functools.lru_cache(maxsize=1024)
def _get_config(
    enable_dropout: bool,
    dtype: torch.dtype,
):
    """Return the forward-kernel launch configuration for this call.

    The JSON file ``{AITER_TRITON_CONFIGS_PATH}/{device}-MHA-DEFAULT.json`` is
    read on the first call only and kept on the function object as
    ``_get_config._config_dict``; later calls just pick an entry from it.

    Args:
        enable_dropout: Whether dropout is active for this launch.
        dtype: Element type of the query tensor.

    Returns:
        The ``["fwd"]["dropout_or_fp32"]`` entry when dropout is enabled or
        ``dtype`` is ``torch.float32``; otherwise the ``["fwd"]["default"]``
        entry.
    """
    if not hasattr(_get_config, "_config_dict"):
        device_name = arch_info.get_device()
        # The attribute is created before the file is read, exactly as the
        # loading code has always done.
        _get_config._config_dict = {}
        config_path = f"{AITER_TRITON_CONFIGS_PATH}/{device_name}-MHA-DEFAULT.json"
        with open(config_path, "r") as handle:
            loaded = json.load(handle)
        _get_config._config_dict["default"] = loaded

    forward_table = _get_config._config_dict["default"]["fwd"]
    use_alternate_entry = enable_dropout or dtype == torch.float32
    entry_name = "dropout_or_fp32" if use_alternate_entry else "default"
    return forward_table[entry_name]
def _flash_attn_forward(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    dropout_p: float,
    softmax_scale: float,
    causal: bool,
    window_size_left: int,
    window_size_right: int,
    bias: Optional[torch.Tensor],
    alibi_slopes: Optional[torch.Tensor],
    return_lse: bool,
    return_softmax: bool,
    max_seqlen_q: int,
    max_seqlen_k: int,
    cu_seqlens_q: Optional[torch.Tensor] = None,
    cu_seqlens_k: Optional[torch.Tensor] = None,
    descale_q: Optional[torch.Tensor] = None,
    descale_k: Optional[torch.Tensor] = None,
    descale_v: Optional[torch.Tensor] = None,
    config: Optional[dict] = None,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int, int]:
    """Allocate outputs and launch the ``_attn_fwd`` Triton kernel.

    Two layouts are handled. When ``cu_seqlens_q`` is given, q/k/v are read as
    ``[total_tokens, num_heads, head_dim]`` (varlen); otherwise as
    ``[batch, seq_len, num_heads, head_dim]``.

    Args:
        q, k, v: Attention inputs in one of the layouts above.
        dropout_p: Dropout probability; dropout is enabled when ``> 0``.
        softmax_scale: Scale forwarded to the kernel.
        causal: Forwarded to the kernel as ``IS_CAUSAL``.
        window_size_left, window_size_right: Must both be ``-1``.
        bias: Must be ``None``.
        alibi_slopes: Optional slopes, 1-D ``(nheads,)`` or 2-D
            ``(batch, nheads)``.
        return_lse: Accepted for interface symmetry; the LSE tensor is always
            allocated and returned.
        return_softmax: Forwarded to the kernel as ``RETURN_SCORES``.
        max_seqlen_q, max_seqlen_k: Maximum sequence lengths, used for the
            launch grid and for sizing the score buffers.
        cu_seqlens_q, cu_seqlens_k: Cumulative sequence lengths (varlen only).
        descale_q, descale_k, descale_v: Descale tensors, required for FP8
            inputs.
        config: Kernel meta-parameters; looked up via ``_get_config`` when
            ``None``.

    Returns:
        ``(o, softmax_lse, s_dmask, philox_seed, philox_offset)``. ``s_dmask``
        is ``None`` unless ``return_softmax`` is set or dropout is enabled.

    Raises:
        ValueError: If bias or a sliding window is requested, if
            ``cu_seqlens_q`` is given without ``cu_seqlens_k``, or if an FP8
            input is passed without all three descale tensors.
    """
    if bias is not None:
        raise ValueError("Bias is not supported yet in the Triton Backend")
    if window_size_left != -1 or window_size_right != -1:
        raise ValueError("Sliding Window is not supported yet in the Triton Backend")

    is_varlen = cu_seqlens_q is not None
    if is_varlen and cu_seqlens_k is None:
        # The varlen kernel path loads from both cumulative-length tensors.
        raise ValueError("cu_seqlens_k must be provided together with cu_seqlens_q")

    # FP8
    IS_FP8 = _is_fp8(q)
    FP8_MAX = torch.finfo(q.dtype).max
    if IS_FP8 and (descale_q is None or descale_k is None or descale_v is None):
        # With IS_FP8 the kernel dereferences all three descale pointers.
        raise ValueError("descale_q, descale_k and descale_v are required for FP8 inputs")

    # FP8 inputs get a float32 output buffer; otherwise the output matches q.
    o = torch.zeros_like(q, dtype=torch.float32) if IS_FP8 else torch.zeros_like(q)

    # Strides are always handed to the kernel in (batch, head, seq, dim) order.
    if is_varlen:
        # Layout for q,k,v is thd ie [total_tokens, num_head, head_dim]
        batch = len(cu_seqlens_q) - 1
        seqlen_q = max_seqlen_q
        num_q_heads, head_sz = q.shape[1], q.shape[2]
        num_k_heads = k.shape[1]
        q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
        k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
        v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
        o_strides = (0, o.stride(1), o.stride(0), o.stride(2))
    else:
        # Layout for q,k,v is bshd ie [batch, seq_len, num_head, head_dim]
        batch, seqlen_q, num_q_heads, head_sz = q.shape
        num_k_heads = k.shape[2]
        q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3))
        k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3))
        v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3))
        o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3))

    # padding for head_dim. Power of 2 or 16
    BLOCK_DMODEL_POW2 = max(triton.next_power_of_2(head_sz), 16)

    # softmax_lse: [total_q, num_q_heads] for varlen, else
    # [batch, num_q_heads, max_seqlen_q]
    if is_varlen:
        softmax_lse = torch.zeros(
            (q.shape[0], num_q_heads), device=q.device, dtype=torch.float32
        )
        stride_lse_z, stride_lse_h, stride_lse_m = (
            0,
            softmax_lse.stride(1),
            softmax_lse.stride(0),
        )
    else:
        softmax_lse = torch.zeros(
            (batch, num_q_heads, max_seqlen_q), device=q.device, dtype=torch.float32
        )
        stride_lse_z, stride_lse_h, stride_lse_m = softmax_lse.stride()

    enable_dropout = dropout_p > 0.0
    if enable_dropout:
        # No specific reason to restrict range to 0xffffff.
        # .item() so the kernel receives Python ints, not tensors.
        philox_seed = torch.randint(0, 0xFFFFFF, (1,))[0].item()
        philox_offset = torch.randint(0, 0xFFFFFF, (1,))[0].item()
    else:
        philox_seed = 0
        philox_offset = 0

    # exp_scores / dropout mask: [batch, num_q_heads, max_seqlen_q, max_seqlen_k]
    if return_softmax or enable_dropout:
        scores_shape = (batch, num_q_heads, max_seqlen_q, max_seqlen_k)
        s_dmask = torch.zeros(scores_shape, device=q.device, dtype=torch.float32)
        dropout_mask = torch.zeros(scores_shape, device=q.device, dtype=torch.float32)
        sd_strides = s_dmask.stride()
    else:
        s_dmask = None
        dropout_mask = None
        sd_strides = (0, 0, 0, 0)

    # The kernel indexes slopes as off_z * stride_z + off_q_head * stride_h,
    # so a 1-D (nheads,) tensor is shared across the batch via stride_z = 0.
    if alibi_slopes is None:
        stride_alibi_z, stride_alibi_h = 0, 0
    elif alibi_slopes.dim() == 1:
        stride_alibi_z, stride_alibi_h = 0, alibi_slopes.stride(0)
    else:
        stride_alibi_z, stride_alibi_h = alibi_slopes.stride(0), alibi_slopes.stride(1)

    if config is None:
        config = _get_config(enable_dropout, q.dtype)

    # One program per (batch, q-head, BLOCK_M tile of queries).
    grid = lambda META: (  # noqa: E731
        batch * num_q_heads * triton.cdiv(seqlen_q, META["BLOCK_M"]),
    )

    _attn_fwd[grid](
        q,
        k,
        v,
        descale_q,
        descale_k,
        descale_v,
        o,
        alibi_slopes,
        s_dmask,
        dropout_mask,
        softmax_lse,
        *q_strides,
        *k_strides,
        *v_strides,
        descale_q.stride(0) if descale_q is not None else 0,
        descale_k.stride(0) if descale_k is not None else 0,
        descale_v.stride(0) if descale_v is not None else 0,
        *o_strides,
        stride_alibi_z,
        stride_alibi_h,
        *sd_strides,
        stride_lse_z,
        stride_lse_h,
        stride_lse_m,
        softmax_scale,
        cu_seqlens_q,
        cu_seqlens_k,
        dropout_p,
        philox_seed,
        philox_offset,
        SEQLEN_Q=max_seqlen_q,
        SEQLEN_K=max_seqlen_k,
        IS_CAUSAL=causal,
        NUM_Q_HEADS=num_q_heads,
        NUM_K_HEADS=num_k_heads,
        BLOCK_DMODEL=head_sz,
        BLOCK_DMODEL_POW2=BLOCK_DMODEL_POW2,
        RETURN_SCORES=return_softmax,
        ENABLE_DROPOUT=enable_dropout,
        IS_FP8=IS_FP8,
        FP8_MAX=FP8_MAX,
        VARLEN=is_varlen,
        BATCH=batch,
        NUM_XCD=8,
        USE_INT64_STRIDES=_USE_INT64_STRIDES,
        **config,
    )

    return o, softmax_lse, s_dmask, philox_seed, philox_offset
class _FlashAttnFunc(torch.autograd.Function):
    """Autograd wrapper around the Triton attention kernels for bshd inputs.

    ``forward`` pads the head dimension up to a multiple of 8, runs
    ``_flash_attn_forward`` and returns the output cut back to the caller's
    head size. ``backward`` runs either the fused or the one-kernel backward
    implementation, selected by ``_USE_FUSED_BWD_KERNEL``.
    """

    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        bias,
        alibi_slopes,
        deterministic,
        return_lse,
        return_softmax,
        is_grad_enabled,
        config=None,
    ):
        """Run the forward kernel.

        Returns ``out`` alone, or a tuple ``(out[, softmax_lse][, S_dmask])``
        depending on ``return_lse`` / ``return_softmax``.
        """
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v])
        if softmax_scale is None:
            # Default scale comes from the head size before any padding.
            softmax_scale = q.shape[-1] ** (-0.5)
        head_size_og = q.size(3)
        if head_size_og % 8 != 0:
            pad = [0, 8 - head_size_og % 8]
            q = torch.nn.functional.pad(q, pad)
            k = torch.nn.functional.pad(k, pad)
            v = torch.nn.functional.pad(v, pad)
        out_padded, softmax_lse, S_dmask, philox_seed, philox_offset = (
            _flash_attn_forward(
                q,
                k,
                v,
                dropout_p,
                softmax_scale,
                causal=causal,
                window_size_left=int(window_size[0]),
                window_size_right=int(window_size[1]),
                bias=bias,
                alibi_slopes=alibi_slopes,
                return_lse=return_lse,
                return_softmax=return_softmax and dropout_p > 0,
                max_seqlen_q=q.shape[1],
                max_seqlen_k=k.shape[1],
                config=config,
            )
        )

        if is_grad:
            # The padded tensors are saved; backward works at the padded width.
            ctx.save_for_backward(q, k, v, out_padded, softmax_lse)
            ctx.philox_seed = philox_seed
            ctx.philox_offset = philox_offset
            ctx.dropout_p = dropout_p
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.bias = bias
            ctx.window_size = window_size
            ctx.alibi_slopes = alibi_slopes
            ctx.deterministic = deterministic

        out = out_padded[..., :head_size_og]
        result = [out]
        if return_lse:
            result.append(softmax_lse)
        if return_softmax:
            result.append(S_dmask)

        return result[0] if len(result) == 1 else tuple(result)

    @staticmethod
    def backward(ctx, do, *args):
        """Compute gradients for q, k, v (and bias when one was saved)."""
        q, k, v, out, softmax_lse = ctx.saved_tensors
        bias = ctx.bias
        dbias = torch.empty_like(bias) if bias is not None else None
        dq, dk, dv = torch.zeros_like(q), torch.empty_like(k), torch.empty_like(v)
        # `do` has the shape of the sliced forward output, so its last
        # dimension is the caller's original (unpadded) head size.
        head_size_og = do.size(3)
        do_padded = do
        if head_size_og % 8 != 0:
            do_padded = torch.nn.functional.pad(do, [0, 8 - head_size_og % 8])

        backward_impl = (
            flash_attn_fused_backward
            if _USE_FUSED_BWD_KERNEL
            else flash_attn_onekernel_backward
        )
        backward_impl(
            do_padded,
            q,
            k,
            v,
            out,
            softmax_lse,
            dq,
            dk,
            dv,
            dbias,
            ctx.softmax_scale,
            ctx.alibi_slopes,
            ctx.causal,
            None,
            None,
            max_seqlen_q=q.shape[1],
            max_seqlen_k=k.shape[1],
            dropout_p=ctx.dropout_p,
            philox_seed=ctx.philox_seed,
            philox_offset=ctx.philox_offset,
            USE_INT64_STRIDES=_USE_INT64_STRIDES,
        )

        # The saved q/k/v are already padded, so slicing to their own width
        # would be a no-op. Cut back to the original head size so gradient
        # shapes match the tensors the caller passed to forward.
        dq = dq[..., :head_size_og]
        dk = dk[..., :head_size_og]
        dv = dv[..., :head_size_og]
        # One entry per forward argument; only q, k, v and bias get gradients.
        return (
            dq,
            dk,
            dv,
            None,
            None,
            None,
            None,
            dbias,
            None,
            None,
            None,
            None,
            None,
            None,
        )
def flash_attn_func(
    q,
    k,
    v,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    bias=None,
    alibi_slopes=None,
    deterministic=True,
    return_lse=False,
    return_attn_probs=False,
    config: Optional[dict[str, any]] = None,
):
    """Attention over fixed-length batches, differentiable via ``_FlashAttnFunc``.

    dropout_p should be set to 0.0 during evaluation.

    Multi-query and grouped-query attention (MQA/GQA) are supported by passing
    K, V with fewer heads than Q; the number of Q heads must be divisible by
    the number of K/V heads. For example, with 6 Q heads and 2 K/V heads,
    Q heads 0, 1, 2 attend to K/V head 0 and Q heads 3, 4, 5 to K/V head 1.

    If causal=True, the causal mask is aligned to the bottom right corner of
    the attention matrix. With seqlen_q = 2 and seqlen_k = 5 the mask
    (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    With seqlen_q = 5 and seqlen_k = 2 it is:
        0 0
        0 0
        0 0
        1 0
        1 1
    A fully masked row produces an all-zero output row.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k: (batch_size, seqlen, nheads_k, headdim)
        v: (batch_size, seqlen, nheads_k, headdim)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Defaults to 1 / sqrt(headdim_q).
        causal: bool. Whether to apply a causal attention mask.
        window_size: (left, right). Only (-1, -1) is accepted; the forward
            pass raises ValueError for any other value.
        bias: Must be None; the forward pass raises ValueError otherwise.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Stored on the autograd context.
        return_lse: bool. Whether to also return the softmax log-sum-exp.
        return_attn_probs: bool. Whether to also return the score/dropout
            buffer. This option is for testing only.
        config: Optional kernel launch configuration; a per-device default is
            used when None.
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_lse=True]: (batch_size, nheads, seqlen).
        S_dmask [optional, if return_attn_probs=True]:
            (batch_size, nheads, seqlen_q, seqlen_k), or None when
            dropout_p == 0.
    """
    # Autograd functions cannot query grad mode reliably from inside forward,
    # so it is captured here and passed through as an argument.
    grad_enabled = torch.is_grad_enabled()
    return _FlashAttnFunc.apply(
        q,
        k,
        v,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        bias,
        alibi_slopes,
        deterministic,
        return_lse,
        return_attn_probs,
        grad_enabled,
        config,
    )
class _FlashAttnFP8Func(torch.autograd.Function):
    """Autograd wrapper that runs attention on FP8-cast bshd inputs.

    ``forward`` pads the head dimension to a multiple of 8, casts q/k/v with
    ``_cast_to_fp8`` and runs ``_flash_attn_forward`` with the resulting
    descale tensors. ``backward`` casts the incoming gradient the same way and
    accumulates float32 gradients.
    """

    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        alibi_slopes,
        deterministic,
        return_lse,
        return_softmax,
        is_grad_enabled,
        config=None,
    ):
        """Run the FP8 forward kernel.

        Returns ``out`` alone, or a tuple ``(out[, softmax_lse][, S_dmask])``
        depending on ``return_lse`` / ``return_softmax``.
        """
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v])
        if softmax_scale is None:
            # Default scale comes from the head size before any padding.
            softmax_scale = q.shape[-1] ** (-0.5)
        head_size_og = q.size(3)
        if head_size_og % 8 != 0:
            pad = [0, 8 - head_size_og % 8]
            q = torch.nn.functional.pad(q, pad)
            k = torch.nn.functional.pad(k, pad)
            v = torch.nn.functional.pad(v, pad)

        # cast input to fp8
        fp8_dtype = arch_info.get_fp8_e4m3_dtype()
        q_fp8, descale_q = _cast_to_fp8(q, fp8_dtype, "bshd")
        k_fp8, descale_k = _cast_to_fp8(k, fp8_dtype, "bshd")
        v_fp8, descale_v = _cast_to_fp8(v, fp8_dtype, "bshd")

        out_padded, softmax_lse, S_dmask, philox_seed, philox_offset = (
            _flash_attn_forward(
                q_fp8,
                k_fp8,
                v_fp8,
                dropout_p,
                softmax_scale,
                causal=causal,
                window_size_left=int(window_size[0]),
                window_size_right=int(window_size[1]),
                bias=None,
                alibi_slopes=alibi_slopes,
                return_lse=return_lse,
                return_softmax=return_softmax and dropout_p > 0,
                max_seqlen_q=q.shape[1],
                max_seqlen_k=k.shape[1],
                cu_seqlens_q=None,
                cu_seqlens_k=None,
                descale_q=descale_q,
                descale_k=descale_k,
                descale_v=descale_v,
                config=config,
            )
        )

        if is_grad:
            ctx.save_for_backward(
                q_fp8,
                k_fp8,
                v_fp8,
                out_padded,
                softmax_lse,
                descale_q,
                descale_k,
                descale_v,
            )
            ctx.philox_seed = philox_seed
            ctx.philox_offset = philox_offset
            ctx.dropout_p = dropout_p
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.alibi_slopes = alibi_slopes

        out = out_padded[..., :head_size_og]
        result = [out]
        if return_lse:
            result.append(softmax_lse)
        if return_softmax:
            result.append(S_dmask)

        return result[0] if len(result) == 1 else tuple(result)

    @staticmethod
    def backward(ctx, do, *args):
        """Compute float32 gradients for q, k and v."""
        q_fp8, k_fp8, v_fp8, out, softmax_lse, descale_q, descale_k, descale_v = (
            ctx.saved_tensors
        )
        dq, dk, dv = (
            torch.zeros_like(q_fp8, dtype=torch.float32),
            torch.zeros_like(k_fp8, dtype=torch.float32),
            torch.zeros_like(v_fp8, dtype=torch.float32),
        )
        # `do` has the shape of the sliced forward output, so its last
        # dimension is the caller's original (unpadded) head size.
        head_size_og = do.size(3)
        do_padded = do
        if head_size_og % 8 != 0:
            do_padded = torch.nn.functional.pad(do, [0, 8 - head_size_og % 8])

        fp8_dtype = arch_info.get_fp8_e4m3_dtype()
        do_padded_fp8, descale_do = _cast_to_fp8(do_padded, fp8_dtype, "bshd")

        backward_impl = (
            flash_attn_fused_backward
            if _USE_FUSED_BWD_KERNEL
            else flash_attn_onekernel_backward
        )
        backward_impl(
            do_padded_fp8,
            q_fp8,
            k_fp8,
            v_fp8,
            out,
            softmax_lse,
            dq,
            dk,
            dv,
            None,
            ctx.softmax_scale,
            ctx.alibi_slopes,
            ctx.causal,
            None,
            None,
            max_seqlen_q=q_fp8.shape[1],
            max_seqlen_k=k_fp8.shape[1],
            dropout_p=ctx.dropout_p,
            philox_seed=ctx.philox_seed,
            philox_offset=ctx.philox_offset,
            descale_q=descale_q,
            descale_k=descale_k,
            descale_v=descale_v,
            descale_do=descale_do,
            USE_INT64_STRIDES=_USE_INT64_STRIDES,
        )

        # The gradient buffers are allocated at the padded width. Cut them
        # back to the original head size so their shapes match the tensors
        # the caller passed to forward.
        dq = dq[..., :head_size_og]
        dk = dk[..., :head_size_og]
        dv = dv[..., :head_size_og]
        # One entry per forward argument (13); only q, k, v get gradients.
        return (dq, dk, dv) + (None,) * 10
def flash_attn_fp8_func(
    q,
    k,
    v,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    alibi_slopes=None,
    deterministic=False,
    return_lse=False,
    return_attn_probs=False,
    config: Optional[dict[str, any]] = None,
):
    """FP8 variant of fixed-length attention, dispatched to ``_FlashAttnFP8Func``.

    Every argument is forwarded unchanged, in order, followed by the current
    autograd grad-mode flag and ``config``. No ``bias`` argument is accepted
    by this entry point.

    Returns whatever ``_FlashAttnFP8Func.apply`` returns: the output tensor,
    or a tuple that also carries the log-sum-exp and/or score buffer when
    ``return_lse`` / ``return_attn_probs`` are set.
    """
    # Grad mode is sampled here and handed to the autograd function explicitly.
    grad_enabled = torch.is_grad_enabled()
    return _FlashAttnFP8Func.apply(
        q,
        k,
        v,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        alibi_slopes,
        deterministic,
        return_lse,
        return_attn_probs,
        grad_enabled,
        config,
    )
class _FlashAttnVarlenFunc(torch.autograd.Function):
    """Autograd wrapper around the Triton attention kernels for packed inputs.

    Inputs use the ``[total_tokens, num_heads, head_dim]`` layout with
    ``cu_seqlens_q`` / ``cu_seqlens_k`` marking sequence boundaries.
    ``block_table`` and ``out`` are accepted but not read here.
    """

    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        bias,
        alibi_slopes,
        deterministic,
        return_lse,
        return_softmax,
        block_table,
        out,
        is_grad_enabled,
        config=None,
    ):
        """Run the varlen forward kernel.

        Returns ``out`` alone, or a tuple ``(out[, softmax_lse][, S_dmask])``
        depending on ``return_lse`` / ``return_softmax``.
        """
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v])
        if softmax_scale is None:
            # Default scale comes from the head size before any padding.
            softmax_scale = q.shape[-1] ** (-0.5)
        head_size_og = q.size(2)
        if head_size_og % 8 != 0:
            pad = [0, 8 - head_size_og % 8]
            q = torch.nn.functional.pad(q, pad)
            k = torch.nn.functional.pad(k, pad)
            v = torch.nn.functional.pad(v, pad)
        out_padded, softmax_lse, S_dmask, philox_seed, philox_offset = (
            _flash_attn_forward(
                q,
                k,
                v,
                dropout_p,
                softmax_scale,
                causal=causal,
                window_size_left=int(window_size[0]),
                window_size_right=int(window_size[1]),
                bias=bias,
                alibi_slopes=alibi_slopes,
                return_lse=return_lse,
                return_softmax=return_softmax and dropout_p > 0.0,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_k=max_seqlen_k,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                config=config,
            )
        )
        if is_grad:
            # The padded tensors are saved; backward works at the padded width.
            ctx.save_for_backward(
                q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k
            )
            ctx.max_seqlen_q = max_seqlen_q
            ctx.max_seqlen_k = max_seqlen_k
            ctx.philox_seed = philox_seed
            ctx.philox_offset = philox_offset
            ctx.dropout_p = dropout_p
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.bias = bias
            ctx.alibi_slopes = alibi_slopes
        out = out_padded[..., :head_size_og]

        result = [out]
        if return_lse:
            result.append(softmax_lse)
        if return_softmax:
            result.append(S_dmask)

        return result[0] if len(result) == 1 else tuple(result)

    @staticmethod
    def backward(ctx, do, *args):
        """Compute gradients for q, k, v (and bias when one was saved)."""
        q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors
        dq, dk, dv = torch.zeros_like(q), torch.empty_like(k), torch.empty_like(v)
        bias = ctx.bias
        dbias = torch.empty_like(bias) if bias is not None else None
        # `do` has the shape of the sliced forward output, so its last
        # dimension is the caller's original (unpadded) head size.
        head_size_og = do.size(2)
        do_padded = do
        if head_size_og % 8 != 0:
            do_padded = torch.nn.functional.pad(do, [0, 8 - head_size_og % 8])

        backward_impl = (
            flash_attn_fused_backward
            if _USE_FUSED_BWD_KERNEL
            else flash_attn_onekernel_backward
        )
        backward_impl(
            do_padded,
            q,
            k,
            v,
            out,
            softmax_lse,
            dq,
            dk,
            dv,
            dbias,
            ctx.softmax_scale,
            ctx.alibi_slopes,
            ctx.causal,
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q=ctx.max_seqlen_q,
            max_seqlen_k=ctx.max_seqlen_k,
            dropout_p=ctx.dropout_p,
            philox_seed=ctx.philox_seed,
            philox_offset=ctx.philox_offset,
            USE_INT64_STRIDES=_USE_INT64_STRIDES,
        )

        # The saved q/k/v are already padded, so slicing to their own width
        # would be a no-op. Cut back to the original head size so gradient
        # shapes match the tensors the caller passed to forward.
        dq = dq[..., :head_size_og]
        dk = dk[..., :head_size_og]
        dv = dv[..., :head_size_og]
        # One entry per forward argument (20): q, k, v, eight non-tensor
        # arguments, bias, then the remaining eight.
        return (dq, dk, dv) + (None,) * 8 + (dbias,) + (None,) * 8
The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + bias: (seqlen_q, seqlen_k) + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). 
class _FlashAttnVarlenFP8Func(torch.autograd.Function):
    """Autograd function for variable-length flash attention computed in FP8.

    ``forward`` pads the head dimension of q/k/v to a multiple of 8, casts the
    three tensors to the architecture's FP8 e4m3 dtype and calls
    ``_flash_attn_forward``.  ``backward`` casts the incoming gradient to FP8
    as well and dispatches to the fused or the one-kernel Triton backward,
    depending on the module-level ``_USE_FUSED_BWD_KERNEL`` switch.

    ``deterministic`` and ``block_table`` are accepted for signature parity
    with the non-FP8 variant but are not used by this class.
    """

    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        alibi_slopes,
        deterministic,
        return_lse,
        return_softmax,
        block_table,
        is_grad_enabled,
        config=None,
    ):
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v])
        if softmax_scale is None:
            softmax_scale = q.shape[-1] ** (-0.5)

        # Remember the caller-visible head sizes *before* padding so that
        # backward can hand back gradients with exactly the input shapes.
        head_sizes_og = (q.size(-1), k.size(-1), v.size(-1))
        head_size_og = q.size(2)
        if head_size_og % 8 != 0:
            q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
            k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
            v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])

        # cast input to fp8
        fp8_dtype = arch_info.get_fp8_e4m3_dtype()
        q_fp8, descale_q = _cast_varlen_to_fp8(q, fp8_dtype, cu_seqlens=cu_seqlens_q)
        k_fp8, descale_k = _cast_varlen_to_fp8(k, fp8_dtype, cu_seqlens=cu_seqlens_k)
        v_fp8, descale_v = _cast_varlen_to_fp8(v, fp8_dtype, cu_seqlens=cu_seqlens_k)

        out_padded, softmax_lse, S_dmask, philox_seed, philox_offset = (
            _flash_attn_forward(
                q_fp8,
                k_fp8,
                v_fp8,
                dropout_p,
                softmax_scale,
                causal=causal,
                window_size_left=int(window_size[0]),
                window_size_right=int(window_size[1]),
                bias=None,
                alibi_slopes=alibi_slopes,
                return_lse=return_lse,
                return_softmax=return_softmax and dropout_p > 0,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_k=max_seqlen_k,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                descale_q=descale_q,
                descale_k=descale_k,
                descale_v=descale_v,
                config=config,
            )
        )
        if is_grad:
            ctx.save_for_backward(
                q_fp8,
                k_fp8,
                v_fp8,
                out_padded,
                softmax_lse,
                cu_seqlens_q,
                cu_seqlens_k,
                descale_q,
                descale_k,
                descale_v,
            )
            ctx.max_seqlen_q = max_seqlen_q
            ctx.max_seqlen_k = max_seqlen_k
            ctx.philox_seed = philox_seed
            ctx.philox_offset = philox_offset
            ctx.dropout_p = dropout_p
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.alibi_slopes = alibi_slopes
            ctx.head_sizes_og = head_sizes_og

        out = out_padded[..., :head_size_og]
        result = [out]
        if return_lse:
            result.append(softmax_lse)
        if return_softmax:
            result.append(S_dmask)

        return result[0] if len(result) == 1 else tuple(result)

    @staticmethod
    def backward(ctx, do, *args):
        (
            q_fp8,
            k_fp8,
            v_fp8,
            out,
            softmax_lse,
            cu_seqlens_q,
            cu_seqlens_k,
            descale_q,
            descale_k,
            descale_v,
        ) = ctx.saved_tensors
        # The kernels accumulate into these buffers, so they start at zero and
        # use fp32 rather than the fp8 dtype of the saved inputs.
        dq = torch.zeros_like(q_fp8, dtype=torch.float32)
        dk = torch.zeros_like(k_fp8, dtype=torch.float32)
        dv = torch.zeros_like(v_fp8, dtype=torch.float32)

        # `do` has the same rank as the (total, heads, head_dim) output, so the
        # head dimension is the last axis (index 3 does not exist here).
        head_size_v_og = do.size(-1)
        do_padded = do
        if head_size_v_og % 8 != 0:
            do_padded = torch.nn.functional.pad(do, [0, 8 - head_size_v_og % 8])

        fp8_dtype = arch_info.get_fp8_e4m3_dtype()
        # Same calling convention as the three casts in forward().
        do_padded_fp8, descale_do = _cast_varlen_to_fp8(
            do_padded, fp8_dtype, cu_seqlens=cu_seqlens_q
        )

        # Both backends take identical arguments; only the kernel differs.
        backward_fn = (
            flash_attn_fused_backward
            if _USE_FUSED_BWD_KERNEL
            else flash_attn_onekernel_backward
        )
        backward_fn(
            do_padded_fp8,
            q_fp8,
            k_fp8,
            v_fp8,
            out,
            softmax_lse,
            dq,
            dk,
            dv,
            None,
            ctx.softmax_scale,
            ctx.alibi_slopes,
            ctx.causal,
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q=ctx.max_seqlen_q,
            max_seqlen_k=ctx.max_seqlen_k,
            dropout_p=ctx.dropout_p,
            philox_seed=ctx.philox_seed,
            philox_offset=ctx.philox_offset,
            descale_q=descale_q,
            descale_k=descale_k,
            descale_v=descale_v,
            descale_do=descale_do,
            USE_INT64_STRIDES=_USE_INT64_STRIDES,
        )

        # Drop the head-dimension padding added in forward(); slicing by the
        # padded fp8 shapes would be a no-op and return wrongly shaped grads.
        q_head, k_head, v_head = ctx.head_sizes_og
        dq = dq[..., :q_head]
        dk = dk[..., :k_head]
        dv = dv[..., :v_head]

        # One entry per forward() input after ctx (18 in total): gradients for
        # q, k, v and None for every non-differentiable argument.
        return (
            dq,
            dk,
            dv,
            None,  # cu_seqlens_q
            None,  # cu_seqlens_k
            None,  # max_seqlen_q
            None,  # max_seqlen_k
            None,  # dropout_p
            None,  # softmax_scale
            None,  # causal
            None,  # window_size
            None,  # alibi_slopes
            None,  # deterministic
            None,  # return_lse
            None,  # return_softmax
            None,  # block_table
            None,  # is_grad_enabled
            None,  # config
        )


def flash_attn_varlen_fp8_func(
    q,
    k,
    v,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    alibi_slopes=None,
    deterministic=False,
    return_lse=False,
    return_attn_probs=False,
    block_table=None,
    config: Optional[dict[str, any]] = None,
):
    """Variable-length flash attention that runs the kernels on FP8 inputs.

    Thin entry point around ``_FlashAttnVarlenFP8Func``: every argument is
    forwarded positionally, together with the current
    ``torch.is_grad_enabled()`` state.

    Returns:
        The attention output alone, or a tuple that additionally contains the
        softmax log-sum-exp (when ``return_lse``) and the softmax/dropout
        matrix (when ``return_attn_probs``), in that order.
    """
    return _FlashAttnVarlenFP8Func.apply(
        q,
        k,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        alibi_slopes,
        deterministic,
        return_lse,
        return_attn_probs,
        block_table,
        torch.is_grad_enabled(),
        config,
    )
DO +# Here is the I/O shape: +# Out: (batch, nhead_q, max_seqlens_q, headDim) +# DO: (batch, nhead_q, max_seqlens_q, headDim) +# Delta: (batch, nheads_q, max_seqlens_q), same as softmax_lse defined at +@triton.jit +def _bwd_preprocess( + o_ptr, + do_ptr, # noqa: E741 + delta_ptr, + stride_o_b, + stride_o_h, + stride_o_m, + stride_o_k, + stride_delta_b, + stride_delta_h, + stride_delta_m, + stride_descale_do_z, + cu_seqlens_q, + max_seqlen_q, + descale_do_ptr, + BLOCK_M: tl.constexpr, + BLOCK_D_MODEL: tl.constexpr, + BLOCK_D_MODEL_POW2: tl.constexpr, + IS_VARLEN: tl.constexpr, + IS_FP8: tl.constexpr, +): + pid_m = tl.program_id(0) # seqlen + bid = tl.program_id(1) # batch + hid = tl.program_id(2) # head + + # Handle varlen + q_start = 0 + seqlen_q = max_seqlen_q + if IS_VARLEN: + q_start = tl.load(cu_seqlens_q + bid) + q_end = tl.load(cu_seqlens_q + bid + 1) + seqlen_q = q_end - q_start + else: + q_start = 0 + seqlen_q = max_seqlen_q + + # Compute offsets + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, BLOCK_D_MODEL_POW2) + + # Offset O/DO by batch, head and q_start + offs = ( + bid * stride_o_b + + hid * stride_o_h + + q_start * stride_o_m + + offs_m[:, None] * stride_o_m + + offs_k[None, :] * stride_o_k + ) + + # create masks + mask_m = offs_m < seqlen_q + mask = mask_m[:, None] + PADDED_HEAD: tl.constexpr = BLOCK_D_MODEL != BLOCK_D_MODEL_POW2 + if PADDED_HEAD: + mask &= offs_k[None, :] < BLOCK_D_MODEL + + # load [BLOCK_M, BLOCK_D_MODEL_POW2] + o = tl.load(o_ptr + offs, mask=mask, other=0.0) + do = tl.load(do_ptr + offs, mask=mask, other=0.0) + + # compute and write-back to delta + if IS_FP8: + descale_do = tl.load(descale_do_ptr + bid * stride_descale_do_z + hid) + + # NOTE: do is in the fp8 range and o is not in fp8 + delta = tl.sum(o.to(tl.float32) * (do.to(tl.float32) * descale_do), axis=1) + else: + delta = tl.sum(o.to(tl.float32) * do.to(tl.float32), axis=1) + + offs_delta = ( + bid * stride_delta_b + + hid * stride_delta_h + + 
# Inner loop of the fused backward pass.
#
# For one fixed [BLOCK_N, head_dim] tile of K and V (`k`, `v`, already loaded
# by the caller) this walks `num_steps` blocks of BLOCK_M query rows starting
# at `start_m` and
#   * accumulates the dK and dV contributions into `dk` / `dv`, which are
#     returned to the caller, and
#   * atomically adds the partial dQ of every visited block into `DQ`.
#
# Q, DO, DQ, M and D are pointers that the caller has already offset to the
# current batch / head / sequence start.  M is read as the per-row softmax
# statistic subtracted inside the exponent; D is the per-row delta term.
# `MASK` enables the causal mask, `ENABLE_DROPOUT` regenerates the dropout
# mask from the philox seed/offset, and `IS_FP8` applies the descale factors
# around every dot product.  `workgroup_id` only rotates the visiting order of
# the query blocks.  `dropout_offset` and the `dk` scaling by sm_scale are not
# handled here (dk is scaled by the callers).
@triton.jit
def _bwd_dkdvdq_inner(
    dk,
    dv,
    Q,
    k,
    v,
    DO,
    DQ,
    M,
    D,
    sm_scale,
    stride_q_m,
    stride_q_k,
    stride_dq_m,
    stride_dq_k,
    stride_do_m,
    stride_do_k,
    stride_dropout_m,
    stride_dropout_n,
    stride_deltam,
    dropout_p,
    philox_seed,
    batch_philox_offset,
    dropout_offset,
    seqlen_q,
    seqlen_k,
    start_n,
    start_m,
    num_steps,
    descale_q,
    descale_k,
    descale_v,
    descale_do,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_D_MODEL: tl.constexpr,
    BLOCK_D_MODEL_POW2: tl.constexpr,
    MASK: tl.constexpr,
    ENABLE_DROPOUT: tl.constexpr,
    IS_FP8: tl.constexpr,
    FP8_MAX: tl.constexpr,
    workgroup_id,
):
    tl.assume(stride_q_m >= 0)
    tl.assume(stride_q_k >= 0)
    tl.assume(stride_dq_m >= 0)
    tl.assume(stride_dq_k >= 0)
    tl.assume(stride_do_m >= 0)
    tl.assume(stride_do_k >= 0)
    tl.assume(stride_deltam >= 0)

    PADDED_HEAD: tl.constexpr = BLOCK_D_MODEL != BLOCK_D_MODEL_POW2
    delta_qk = seqlen_q - seqlen_k
    offs_m = start_m + tl.arange(0, BLOCK_M)
    offs_n = start_n + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_D_MODEL_POW2)

    # mask to make sure the key/value rows are not OOB of seqlen_k
    mask_n = offs_n < seqlen_k

    qT_ptrs_start = (
        Q + offs_m[None, :] * stride_q_m + offs_k[:, None] * stride_q_k
    )  # [BLOCK_D_MODEL_POW2, BLOCK_M]
    dq_ptrs_start = (
        DQ + offs_m[:, None] * stride_dq_m + offs_k[None, :] * stride_dq_k
    )  # [BLOCK_M, BLOCK_D_MODEL_POW2]

    do_ptrs_start = DO + offs_m[:, None] * stride_do_m + offs_k[None, :] * stride_do_k
    curr_m = start_m
    step_m = BLOCK_M
    curr_philox_offset = batch_philox_offset

    # Iterate over blocks (BLOCK_M size) of Q while calculating
    # a fixed block (BLOCK_N) of dk and dv. During the backward
    # pass P has to be recomputed. Because this kernel computes
    # dV and dK, it works with the transposes P^T and S^T. See the
    # backward pass equations.
    #
    # From Flash Attention Paper:
    # ForwardPass: S = QkT, P=softmax(S), O=PV
    #
    # BackwardPass equations
    # dV = P^TdO
    # dP = dOV^T
    # dS = dsoftmax(dP)
    # dQ = dSK
    # dK = QdS^T

    for step in range(num_steps):
        # Permute the iteration order to reduce the probability that concurrent
        # workgroups (that share the same q head idx and batch idx) are at the
        # same iteration and therefore contend on the same dq atomics.
        blk_idx = (step + workgroup_id) % num_steps

        curr_m = start_m + blk_idx * step_m
        qT_ptrs = qT_ptrs_start + blk_idx * step_m * stride_q_m
        dq_ptrs = dq_ptrs_start + blk_idx * step_m * stride_dq_m
        do_ptrs = do_ptrs_start + blk_idx * step_m * stride_do_m

        offs_m = curr_m + tl.arange(0, BLOCK_M)
        mask_m = offs_m < seqlen_q
        mask_qT = mask_m[None, :]
        mask_do = mask_m[:, None]
        mask_nm = mask_n[:, None] & (offs_m[None, :] < seqlen_q)

        if PADDED_HEAD:
            mask_qT &= offs_k[:, None] < BLOCK_D_MODEL
            mask_do &= offs_k[None, :] < BLOCK_D_MODEL

        # load qT
        qT = tl.load(qT_ptrs, mask=mask_qT, other=0.0)

        # dropout
        if ENABLE_DROPOUT:
            # NOTE: dropout is transposed because it is used to mask pT
            philox_offs = (
                curr_philox_offset
                + offs_m[None, :] * stride_dropout_m
                + offs_n[:, None] * stride_dropout_n
            )
            rand_vals = tl.rand(philox_seed, philox_offs)
            dropout_mask = rand_vals > dropout_p
            dropout_scale = 1.0 / (1 - dropout_p)

        # Load M
        m = tl.load(M + offs_m * stride_deltam, mask=mask_m, other=0.0)

        # Compute qkT
        if IS_FP8:
            qkT = tl.dot(k, qT) * descale_q * descale_k
        else:
            qkT = tl.dot(k, qT)

        # Compute pT(use m and also apply sm_scale)
        pT = tl.math.exp(qkT * sm_scale - m[None, :])

        if MASK:
            causal_mask = (offs_m[None, :] - delta_qk) >= (offs_n[:, None])
            mask = causal_mask & mask_nm
            pT = tl.where(mask, pT, 0.0)

        # load DO
        do = tl.load(do_ptrs, mask=mask_do, other=0.0)

        # dV
        if ENABLE_DROPOUT:
            pT_dropout = tl.where(dropout_mask, pT, 0.0) * dropout_scale
            if IS_FP8:
                scale_p_dropout, descale_p_dropout = _compute_fp8_scaling_factors(
                    pT_dropout, FP8_MAX
                )
                dv += (
                    tl.dot((pT_dropout * scale_p_dropout).to(do.type.element_ty), do)
                    * descale_p_dropout
                    * descale_do
                )
            else:
                dv += tl.dot(pT_dropout.to(do.type.element_ty), do)
        else:
            if IS_FP8:
                scale_pT, descale_pT = _compute_fp8_scaling_factors(pT, FP8_MAX)
                dv += (
                    tl.dot((pT * scale_pT).to(do.type.element_ty), do)
                    * descale_pT
                    * descale_do
                )
            else:
                dv += tl.dot(pT.to(do.type.element_ty), do)

        # Load delta. `other=0.0` gives rows past seqlen_q a defined value:
        # without it the masked lanes are undefined and, because pT is not
        # zeroed for those rows when MASK is False, a NaN/Inf there would
        # flow through dsT into the dk dot product below.
        Di = tl.load(D + offs_m * stride_deltam, mask=mask_m, other=0.0)

        # Compute dP and dS
        if IS_FP8:
            dpT = tl.dot(v, tl.trans(do)) * descale_v * descale_do
        else:
            dpT = tl.dot(v, tl.trans(do))

        if ENABLE_DROPOUT:
            dpT = tl.where(dropout_mask, dpT, 0.0) * dropout_scale

        delta_i = Di[None, :]
        dsT = pT * (dpT - delta_i)

        # compute dk
        if IS_FP8:
            scale_dsT, descale_dsT = _compute_fp8_scaling_factors(dsT, FP8_MAX)
            dk += (
                tl.dot((dsT * scale_dsT).to(qT.type.element_ty), tl.trans(qT))
                * descale_dsT
                * descale_q
            )
        else:
            dk += tl.dot(dsT.to(qT.type.element_ty), tl.trans(qT))

        # We can compute the dq_partial here and do an atomic add to the correct memory location
        # NOTE: Possible problems with the atomic add: contention, is inside a loop which has achieved bad perf before
        # (BLOCK_M, BLOCK_N) x (BLOCK_N, D)
        if IS_FP8:
            dq_partial = (
                tl.dot((dsT * scale_dsT).to(k.dtype).T, k) * descale_dsT * descale_k
            )
        else:
            dq_partial = tl.dot(dsT.to(k.dtype).T, k)
        tl.atomic_add(
            dq_ptrs,
            dq_partial * sm_scale,
            mask=mask_m[:, None] & (offs_k[None, :] < BLOCK_D_MODEL),
            sem="relaxed",
        )

    return dk, dv
stride_v_n_in, + stride_v_k_in, + stride_dk_b_in, + stride_dk_h_in, + stride_dk_n_in, + stride_dk_k_in, + stride_dq_b_in, + stride_dq_h_in, + stride_dq_m_in, + stride_dq_k_in, + stride_delta_b_in, + stride_delta_h_in, + stride_delta_m_in, + stride_do_b_in, + stride_do_h_in, + stride_do_m_in, + stride_do_k_in, + stride_dropout_b_in, + stride_dropout_h_in, + stride_dropout_m_in, + stride_dropout_n_in, + stride_descale_q_z_in, + stride_descale_k_z_in, + stride_descale_v_z_in, + stride_descale_do_z_in, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_mask, + dropout_p, + philox_seed, + philox_offset_base_in, + descale_q_ptr, + descale_k_ptr, + descale_v_ptr, + descale_do_ptr, + NUM_Q_HEADS: tl.constexpr, + NUM_K_HEADS: tl.constexpr, + BATCH, + NUM_K_PIDS, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLK_SLICE_FACTOR: tl.constexpr, + BLOCK_D_MODEL: tl.constexpr, + BLOCK_D_MODEL_POW2: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + IS_VARLEN: tl.constexpr, + IS_FP8: tl.constexpr, + FP8_MAX: tl.constexpr, + NUM_SMS: tl.constexpr, + USE_INT64_STRIDES: tl.constexpr, +): + if USE_INT64_STRIDES: + stride_q_b = tl.cast(stride_q_b_in, tl.int64) + stride_q_h = tl.cast(stride_q_h_in, tl.int64) + stride_q_m = tl.cast(stride_q_m_in, tl.int64) + stride_q_k = tl.cast(stride_q_k_in, tl.int64) + stride_k_b = tl.cast(stride_k_b_in, tl.int64) + stride_k_h = tl.cast(stride_k_h_in, tl.int64) + stride_k_n = tl.cast(stride_k_n_in, tl.int64) + stride_k_k = tl.cast(stride_k_k_in, tl.int64) + stride_v_b = tl.cast(stride_v_b_in, tl.int64) + stride_v_h = tl.cast(stride_v_h_in, tl.int64) + stride_v_n = tl.cast(stride_v_n_in, tl.int64) + stride_v_k = tl.cast(stride_v_k_in, tl.int64) + stride_dk_b = tl.cast(stride_dk_b_in, tl.int64) + stride_dk_h = tl.cast(stride_dk_h_in, tl.int64) + stride_dk_n = tl.cast(stride_dk_n_in, tl.int64) + stride_dk_k = tl.cast(stride_dk_k_in, tl.int64) + stride_dq_b = tl.cast(stride_dq_b_in, tl.int64) + stride_dq_h = tl.cast(stride_dq_h_in, 
tl.int64) + stride_dq_m = tl.cast(stride_dq_m_in, tl.int64) + stride_dq_k = tl.cast(stride_dq_k_in, tl.int64) + stride_delta_b = tl.cast(stride_delta_b_in, tl.int64) + stride_delta_h = tl.cast(stride_delta_h_in, tl.int64) + stride_delta_m = tl.cast(stride_delta_m_in, tl.int64) + stride_do_b = tl.cast(stride_do_b_in, tl.int64) + stride_do_h = tl.cast(stride_do_h_in, tl.int64) + stride_do_m = tl.cast(stride_do_m_in, tl.int64) + stride_do_k = tl.cast(stride_do_k_in, tl.int64) + stride_dropout_b = tl.cast(stride_dropout_b_in, tl.int64) + stride_dropout_h = tl.cast(stride_dropout_h_in, tl.int64) + stride_dropout_m = tl.cast(stride_dropout_m_in, tl.int64) + stride_dropout_n = tl.cast(stride_dropout_n_in, tl.int64) + philox_offset_base = tl.cast(philox_offset_base_in, tl.int64) + if IS_FP8: + stride_descale_q_z = tl.cast(stride_descale_q_z_in, tl.int64) + stride_descale_k_z = tl.cast(stride_descale_k_z_in, tl.int64) + stride_descale_v_z = tl.cast(stride_descale_v_z_in, tl.int64) + stride_descale_do_z = tl.cast(stride_descale_do_z_in, tl.int64) + else: + stride_q_b = stride_q_b_in + stride_q_h = stride_q_h_in + stride_q_m = stride_q_m_in + stride_q_k = stride_q_k_in + stride_k_b = stride_k_b_in + stride_k_h = stride_k_h_in + stride_k_n = stride_k_n_in + stride_k_k = stride_k_k_in + stride_v_b = stride_v_b_in + stride_v_h = stride_v_h_in + stride_v_n = stride_v_n_in + stride_v_k = stride_v_k_in + stride_dk_b = stride_dk_b_in + stride_dk_h = stride_dk_h_in + stride_dk_n = stride_dk_n_in + stride_dk_k = stride_dk_k_in + stride_dq_b = stride_dq_b_in + stride_dq_h = stride_dq_h_in + stride_dq_m = stride_dq_m_in + stride_dq_k = stride_dq_k_in + stride_delta_b = stride_delta_b_in + stride_delta_h = stride_delta_h_in + stride_delta_m = stride_delta_m_in + stride_do_b = stride_do_b_in + stride_do_h = stride_do_h_in + stride_do_m = stride_do_m_in + stride_do_k = stride_do_k_in + stride_dropout_b = stride_dropout_b_in + stride_dropout_h = stride_dropout_h_in + stride_dropout_m = 
stride_dropout_m_in + stride_dropout_n = stride_dropout_n_in + philox_offset_base = philox_offset_base_in + stride_descale_q_z = stride_descale_q_z_in + stride_descale_k_z = stride_descale_k_z_in + stride_descale_v_z = stride_descale_v_z_in + stride_descale_do_z = stride_descale_do_z_in + + GROUP_SIZE = NUM_Q_HEADS // NUM_K_HEADS + wid = tl.program_id(0) # workgoup id: 0, ..., NUM_Q_PIDS * BATCH * NUM_K_HEADS - 1 + + NUM_XCD: tl.constexpr = 8 + head_q_idx = wid % NUM_Q_HEADS + head_q_idx = remap_xcd(head_q_idx, NUM_Q_HEADS, NUM_XCD) + seq_k_blk_idx = (wid // NUM_Q_HEADS) % NUM_K_PIDS + batch_idx = (wid // (NUM_K_PIDS * NUM_Q_HEADS)) % BATCH + + # In the backward we dont want concurrent workgroups to handle consecutive heads or blocks, so remap them to be far apart. + head_q_idx = (head_q_idx * 29) % NUM_Q_HEADS + # seq_k_blk_idx = (seq_k_blk_idx * 29) % NUM_K_PIDS + + head_k_idx = head_q_idx // GROUP_SIZE + + # Determine q and k start along with seqlen_q and seqlen_k + q_start = 0 + k_start = 0 + seqlen_q = max_seqlen_q + seqlen_k = max_seqlen_k + if IS_VARLEN: + q_start = tl.load(cu_seqlens_q + batch_idx) + q_end = tl.load(cu_seqlens_q + batch_idx + 1) + k_start = tl.load(cu_seqlens_k + batch_idx) + k_end = tl.load(cu_seqlens_k + batch_idx + 1) + seqlen_q = q_end - q_start + seqlen_k = k_end - k_start + + dk = tl.zeros([BLOCK_N, BLOCK_D_MODEL_POW2], dtype=tl.float32) + dv = tl.zeros([BLOCK_N, BLOCK_D_MODEL_POW2], dtype=tl.float32) + + # Figure out causal starting block since we have seqlen_q >=< seqlen_k. + # Unlike forward pass where we tile on M dim and iterate on N dim, so that + # we can skip some M blocks, in backward pass, we tile on the N dim for kv + # and iterate over the M. In this way, we cannot skip N blocks, but only to + # determine the starting M blocks to skip some initial blocks masked by + # causal. 
+ delta_qk = seqlen_q - seqlen_k + + # q < k: some blocks will have no Masked block, other needs to re-calc + # starting position + # delta_qk is negative so flip it, only multiple of BLOCK_N can skip the + # masked op + num_blocks_skip = -delta_qk // BLOCK_N + delta_aligned = (num_blocks_skip + 1) * BLOCK_N + delta_qk + start_delta_q_lt_k = delta_aligned // BLOCK_M * BLOCK_M + if delta_qk >= 0: + start_delta = delta_qk + else: + start_delta = start_delta_q_lt_k + + start_n = seq_k_blk_idx * BLOCK_N + + offs_k = tl.arange(0, BLOCK_D_MODEL_POW2) + offs_n = start_n + tl.arange(0, BLOCK_N) + # Mask for loading K and V + mask_kv = offs_n[:, None] < seqlen_k + PADDED_HEAD: tl.constexpr = BLOCK_D_MODEL != BLOCK_D_MODEL_POW2 + if PADDED_HEAD: + mask_k = offs_k < BLOCK_D_MODEL + mask_kv &= mask_k[None, :] + + GROUP_SIZE = NUM_Q_HEADS // NUM_K_HEADS + adj_k = ( + batch_idx * stride_k_b + + head_k_idx * stride_k_h + + k_start * stride_k_n + + offs_n[:, None] * stride_k_n + + offs_k[None, :] * stride_k_k + ) + adj_v = ( + batch_idx * stride_v_b + + head_k_idx * stride_v_h + + k_start * stride_v_n + + offs_n[:, None] * stride_v_n + + offs_k[None, :] * stride_v_k + ) + # load K and V: they stay in SRAM throughout the inner loop. + k = tl.load(k_ptr + adj_k, mask=mask_kv, other=0.0) + v = tl.load(v_ptr + adj_v, mask=mask_kv, other=0.0) + + # If MQA / GQA, set the K and V head offsets appropriately. 
+ # for head_q_idx in range(head_k_idx * GROUP_SIZE, head_k_idx * GROUP_SIZE + GROUP_SIZE): + if delta_qk >= 0: + start_m = start_n + start_delta + len_m = BLOCK_N + else: + start_m = max(start_n + delta_qk, 0) + start_m = (start_m // BLOCK_M) * BLOCK_M + # because we might shift the masked blocks up, we are deeper into + # the masked out region, so we would potentially increase the total + # steps with masked operation to get out of it + residue_m = max(start_n + delta_qk - start_m, 0) + len_m = BLOCK_N + residue_m + + # offset input and output tensor by batch and Q/K heads + adj_q = batch_idx * stride_q_b + head_q_idx * stride_q_h + q_start * stride_q_m + adj_dq = batch_idx * stride_dq_b + head_q_idx * stride_dq_h + q_start * stride_dq_m + + q_ptr_adj = q_ptr + adj_q + dq_ptr_adj = dq_ptr + adj_dq + + adj_do = batch_idx * stride_do_b + head_q_idx * stride_do_h + q_start * stride_do_m + do_ptr_adj = do_ptr + adj_do + adj_delta = ( + batch_idx * stride_delta_b + + head_q_idx * stride_delta_h + + q_start * stride_delta_m + ) + m_ptr_adj = m_ptr + adj_delta + delta_ptr_adj = delta_ptr + adj_delta + + # batch_philox_offset is the ACTUALLY dropout offset + # dropout_offset is for debug purpose and will be removed later + batch_philox_offset = 0 + dropout_offset = 0 + if ENABLE_DROPOUT: + batch_philox_offset = ( + philox_offset_base + + batch_idx * stride_dropout_b + + head_q_idx * stride_dropout_h + ) + dropout_offset = ( + dropout_mask + batch_idx * stride_dropout_b + head_q_idx * stride_dropout_h + ) + + MASK_BLOCK_M: tl.constexpr = BLOCK_M // BLK_SLICE_FACTOR + # bound the masked operation to q len so it does not have to wast cycles + len_m = min(len_m, seqlen_q) + num_steps = tl.cdiv(len_m, MASK_BLOCK_M) + + # when q < k, we may skip the initial masked op + if seq_k_blk_idx < num_blocks_skip: + num_steps = 0 + + if IS_FP8: + descale_q = tl.load(descale_q_ptr + batch_idx * stride_descale_q_z + head_q_idx) + descale_k = tl.load(descale_k_ptr + batch_idx * 
stride_descale_k_z + head_k_idx) + descale_v = tl.load(descale_v_ptr + batch_idx * stride_descale_v_z + head_k_idx) + descale_do = tl.load( + descale_do_ptr + batch_idx * stride_descale_do_z + head_q_idx + ) + else: + descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0 + + # if unaligned start_m is negative, the current N-tile has no block on the + # diagonal of causal mask, so everything have no causal mask + dk, dv = _bwd_dkdvdq_inner( + dk, + dv, # output tensors + q_ptr_adj, + k, + v, + do_ptr_adj, + dq_ptr_adj, + m_ptr_adj, + delta_ptr_adj, + sm_scale, # input tensors + stride_q_m, + stride_q_k, # strides for q + stride_dq_m, + stride_dq_k, # strides for q + stride_do_m, + stride_do_k, # strides for o + stride_dropout_m, + stride_dropout_n, # strides for dropout + stride_delta_m, + dropout_p, + philox_seed, + batch_philox_offset, + dropout_offset, # + seqlen_q, + seqlen_k, # max sequence length for q and k + start_n, + start_m, + num_steps, # iteration numbers + descale_q, + descale_k, + descale_v, + descale_do, # fp8 descale factors from user + MASK_BLOCK_M, + BLOCK_N, # block dim + BLOCK_D_MODEL, + BLOCK_D_MODEL_POW2, # head dim + MASK=True, # causal masking + ENABLE_DROPOUT=ENABLE_DROPOUT, # activate dropout + IS_FP8=IS_FP8, + FP8_MAX=FP8_MAX, + workgroup_id=seq_k_blk_idx, + ) + + start_m += num_steps * MASK_BLOCK_M + num_steps = tl.cdiv(seqlen_q - start_m, BLOCK_M) + + dk, dv = _bwd_dkdvdq_inner( + dk, + dv, # output tensors + q_ptr_adj, + k, + v, + do_ptr_adj, + dq_ptr_adj, + m_ptr_adj, + delta_ptr_adj, + sm_scale, # input tensors + stride_q_m, + stride_q_k, # strides for q + stride_dq_m, + stride_dq_k, # strides for dq + stride_do_m, + stride_do_k, # strides for o + stride_dropout_m, + stride_dropout_n, # strides for dropout + stride_delta_m, + dropout_p, + philox_seed, + batch_philox_offset, + dropout_offset, # + seqlen_q, + seqlen_k, # max sequence length for q and k + start_n, + start_m, + num_steps, # iteration numbers + descale_q, + 
descale_k, + descale_v, + descale_do, # fp8 descale factors from user + BLOCK_M, + BLOCK_N, # block dim + BLOCK_D_MODEL, + BLOCK_D_MODEL_POW2, # head dim + MASK=False, # causal masking + ENABLE_DROPOUT=ENABLE_DROPOUT, # activate dropout + IS_FP8=IS_FP8, + FP8_MAX=FP8_MAX, + workgroup_id=seq_k_blk_idx, + ) + + # Write back dV and dK. + offs_dkdv = ( + batch_idx * stride_dk_b + + head_k_idx * stride_dk_h + + k_start * stride_dk_n + + offs_n[:, None] * stride_dk_n + + offs_k[None, :] * stride_dk_k + ) + tl.atomic_add(dv_ptr + offs_dkdv, dv, mask=mask_kv, sem="relaxed") + dk *= sm_scale + tl.atomic_add(dk_ptr + offs_dkdv, dk, mask=mask_kv, sem="relaxed") + + +@triton.jit +def _bwd_kernel_dkdvdq_noncausal( + Q, + K, + V, + sm_scale, + DO, + DK, + DV, + DQ, + M, + Delta, + stride_qb_in, + stride_qh_in, + stride_qm_in, + stride_qk_in, + stride_kb_in, + stride_kh_in, + stride_kn_in, + stride_kk_in, + stride_vb_in, + stride_vh_in, + stride_vn_in, + stride_vk_in, + stride_dkb_in, + stride_dkh_in, + stride_dkn_in, + stride_dkk_in, + stride_dqb_in, + stride_dqh_in, + stride_dqm_in, + stride_dqk_in, + stride_deltab_in, + stride_deltah_in, + stride_deltam_in, + stride_dob_in, + stride_doh_in, + stride_dom_in, + stride_dok_in, + stride_dropoutb_in, + stride_dropouth_in, + stride_dropoutm_in, + stride_dropoutn_in, + stride_descale_q_z_in, + stride_descale_k_z_in, + stride_descale_v_z_in, + stride_descale_do_z_in, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_mask, + dropout_p, + philox_seed, + philox_offset, + descale_q_ptr, + descale_k_ptr, + descale_v_ptr, + descale_do_ptr, + NUM_Q_HEADS: tl.constexpr, + NUM_K_HEADS: tl.constexpr, + BATCH, + NUM_K_PIDS, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLK_SLICE_FACTOR: tl.constexpr, + BLOCK_D_MODEL: tl.constexpr, + BLOCK_D_MODEL_POW2: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + IS_VARLEN: tl.constexpr, + IS_FP8: tl.constexpr, + FP8_MAX: tl.constexpr, + NUM_SMS: tl.constexpr, + 
USE_INT64_STRIDES: tl.constexpr, +): + if USE_INT64_STRIDES: + stride_qb = tl.cast(stride_qb_in, tl.int64) + stride_qh = tl.cast(stride_qh_in, tl.int64) + stride_qm = tl.cast(stride_qm_in, tl.int64) + stride_qk = tl.cast(stride_qk_in, tl.int64) + stride_kb = tl.cast(stride_kb_in, tl.int64) + stride_kh = tl.cast(stride_kh_in, tl.int64) + stride_kn = tl.cast(stride_kn_in, tl.int64) + stride_kk = tl.cast(stride_kk_in, tl.int64) + stride_vb = tl.cast(stride_vb_in, tl.int64) + stride_vh = tl.cast(stride_vh_in, tl.int64) + stride_vn = tl.cast(stride_vn_in, tl.int64) + stride_vk = tl.cast(stride_vk_in, tl.int64) + stride_dkb = tl.cast(stride_dkb_in, tl.int64) + stride_dkh = tl.cast(stride_dkh_in, tl.int64) + stride_dkn = tl.cast(stride_dkn_in, tl.int64) + stride_dkk = tl.cast(stride_dkk_in, tl.int64) + stride_dqb = tl.cast(stride_dqb_in, tl.int64) + stride_dqh = tl.cast(stride_dqh_in, tl.int64) + stride_dqm = tl.cast(stride_dqm_in, tl.int64) + stride_dqk = tl.cast(stride_dqk_in, tl.int64) + stride_deltab = tl.cast(stride_deltab_in, tl.int64) + stride_deltah = tl.cast(stride_deltah_in, tl.int64) + stride_deltam = tl.cast(stride_deltam_in, tl.int64) + stride_dob = tl.cast(stride_dob_in, tl.int64) + stride_doh = tl.cast(stride_doh_in, tl.int64) + stride_dom = tl.cast(stride_dom_in, tl.int64) + stride_dok = tl.cast(stride_dok_in, tl.int64) + stride_dropoutb = tl.cast(stride_dropoutb_in, tl.int64) + stride_dropouth = tl.cast(stride_dropouth_in, tl.int64) + stride_dropoutm = tl.cast(stride_dropoutm_in, tl.int64) + stride_dropoutn = tl.cast(stride_dropoutn_in, tl.int64) + if IS_FP8: + stride_descale_q_z = tl.cast(stride_descale_q_z_in, tl.int64) + stride_descale_k_z = tl.cast(stride_descale_k_z_in, tl.int64) + stride_descale_v_z = tl.cast(stride_descale_v_z_in, tl.int64) + stride_descale_do_z = tl.cast(stride_descale_do_z_in, tl.int64) + else: + stride_qb = stride_qb_in + stride_qh = stride_qh_in + stride_qm = stride_qm_in + stride_qk = stride_qk_in + stride_kb = stride_kb_in + 
stride_kh = stride_kh_in + stride_kn = stride_kn_in + stride_kk = stride_kk_in + stride_vb = stride_vb_in + stride_vh = stride_vh_in + stride_vn = stride_vn_in + stride_vk = stride_vk_in + stride_dkb = stride_dkb_in + stride_dkh = stride_dkh_in + stride_dkn = stride_dkn_in + stride_dkk = stride_dkk_in + stride_dqb = stride_dqb_in + stride_dqh = stride_dqh_in + stride_dqm = stride_dqm_in + stride_dqk = stride_dqk_in + stride_deltab = stride_deltab_in + stride_deltah = stride_deltah_in + stride_deltam = stride_deltam_in + stride_dob = stride_dob_in + stride_doh = stride_doh_in + stride_dom = stride_dom_in + stride_dok = stride_dok_in + stride_dropoutb = stride_dropoutb_in + stride_dropouth = stride_dropouth_in + stride_dropoutm = stride_dropoutm_in + stride_dropoutn = stride_dropoutn_in + stride_descale_q_z = stride_descale_q_z_in + stride_descale_k_z = stride_descale_k_z_in + stride_descale_v_z = stride_descale_v_z_in + stride_descale_do_z = stride_descale_do_z_in + + # workgroup id + wid = tl.program_id(0) # 0, ..., NUM_K_PIDS * BATCH * NUM_K_HEADS - 1 + + # Workgroups get launched first along batch dim, then in head_k dim, and then in seq k block dim + # This is in order to avoid contention for the tl.atomic_add (inside _bwd_dkdvdq_inner) that happens between workgroups that share the same batch and head_k. 
+ bid = wid % BATCH + hkid = wid // BATCH % NUM_K_HEADS + pid = wid // (BATCH * NUM_K_HEADS) % NUM_K_PIDS + + q_start = 0 + k_start = 0 + seqlen_q = max_seqlen_q + seqlen_k = max_seqlen_k + + if IS_VARLEN: + q_start = tl.load(cu_seqlens_q + bid) + q_end = tl.load(cu_seqlens_q + bid + 1) + k_start = tl.load(cu_seqlens_k + bid) + k_end = tl.load(cu_seqlens_k + bid + 1) + seqlen_q = q_end - q_start + seqlen_k = k_end - k_start + + dk = tl.zeros([BLOCK_N, BLOCK_D_MODEL_POW2], dtype=tl.float32) + dv = tl.zeros([BLOCK_N, BLOCK_D_MODEL_POW2], dtype=tl.float32) + + start_n = pid * BLOCK_N + + offs_k = tl.arange(0, BLOCK_D_MODEL_POW2) + offs_n = start_n + tl.arange(0, BLOCK_N) + mask_kv = offs_n[:, None] < seqlen_k + PADDED_HEAD: tl.constexpr = BLOCK_D_MODEL != BLOCK_D_MODEL_POW2 + if PADDED_HEAD: + mask_kv &= offs_k < BLOCK_D_MODEL + + GROUP_SIZE = NUM_Q_HEADS // NUM_K_HEADS + adj_k = ( + bid * stride_kb + + hkid * stride_kh + + k_start * stride_kn + + offs_n[:, None] * stride_kn + + offs_k[None, :] * stride_kk + ) + adj_v = ( + bid * stride_vb + + hkid * stride_vh + + k_start * stride_vn + + offs_n[:, None] * stride_vn + + offs_k[None, :] * stride_vk + ) + + k = tl.load(K + adj_k, mask=mask_kv, other=0.0) + v = tl.load(V + adj_v, mask=mask_kv, other=0.0) + + for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE): + adj_q = bid * stride_qb + hqid * stride_qh + q_start * stride_qm + adj_dq = bid * stride_dqb + hqid * stride_dqh + q_start * stride_dqm + + Q_ptr = Q + adj_q + DQ_ptr = DQ + adj_dq + + adj_do = bid * stride_dob + hqid * stride_doh + q_start * stride_dom + DO_ptr = DO + adj_do + adj_delta = bid * stride_deltab + hqid * stride_deltah + q_start * stride_deltam + M_ptr = M + adj_delta + Delta_ptr = Delta + adj_delta + + # dropout + batch_philox_offset = 0 + dropout_offset = 0 + if ENABLE_DROPOUT: + batch_philox_offset = ( + philox_offset + bid * stride_dropoutb + hqid * stride_dropouth + ) + dropout_offset = ( + dropout_mask + bid * stride_dropoutb + 
@functools.lru_cache(maxsize=1024)
def _get_config():
    """Return the ``"bkwd_fused"`` section of the per-device MHA default config.

    The JSON file ``{AITER_TRITON_CONFIGS_PATH}/{dev}-MHA-DEFAULT.json`` is
    parsed on first use; the parsed dictionary is kept on the function object
    as ``_get_config._config_dict`` and reused by later calls.

    Raises:
        OSError: if the configuration file cannot be opened.
        KeyError: if the file has no ``"bkwd_fused"`` entry.
    """
    if not hasattr(_get_config, "_config_dict"):
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/{dev}-MHA-DEFAULT.json"
        with open(fpath, "r", encoding="utf-8") as file:
            config = json.load(file)
        # Publish the cache only after a successful load.  Assigning an empty
        # placeholder first would survive a failed open()/parse and make every
        # later call fail with a misleading KeyError instead of the real error.
        _get_config._config_dict = config

    return _get_config._config_dict["bkwd_fused"]


def flash_attn_fused_backward(
    do: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    o: torch.Tensor,
    softmax_lse: torch.Tensor,
    dq: torch.Tensor,
    dk: torch.Tensor,
    dv: torch.Tensor,
    dbias: torch.Tensor,
    sm_scale: float,
    alibi_slopes: Optional[torch.Tensor],
    causal: bool,
    cu_seqlens_q: Optional[torch.Tensor],
    cu_seqlens_k: Optional[torch.Tensor],
    max_seqlen_q: int,
    max_seqlen_k: int,
    dropout_p: float,
    philox_seed: Optional[int] = 0,
    philox_offset: Optional[int] = 0,
    descale_q: Optional[torch.Tensor] = None,
    descale_k: Optional[torch.Tensor] = None,
    descale_v: Optional[torch.Tensor] = None,
    descale_do: Optional[torch.Tensor] = None,
    USE_INT64_STRIDES: Optional[bool] = False,
    # NOTE(review): ``any`` here is the builtin function; ``typing.Any`` was
    # presumably intended.  Left untouched to avoid needing a new import.
    config: Optional[Dict[str, any]] = None,
):
    """Run the fused Triton attention backward pass.

    Two launches are issued: ``_bwd_preprocess`` fills ``delta`` from ``o`` and
    ``do``, then a single fused kernel (causal or non-causal variant) is given
    ``dq``, ``dk`` and ``dv`` as outputs.

    Layout is selected by ``cu_seqlens_q``: when it is not ``None`` the tensors
    are treated as ``[total_tokens, num_heads, head_dim]`` (varlen), otherwise
    as ``[batch, seq_len, num_heads, head_dim]``.

    Notes:
        * ``alibi_slopes`` is accepted but never read by this function.
        * No separate ``dv`` strides are passed to the kernels; only
          ``dk_strides`` are.  ``dv`` is therefore assumed to share ``dk``'s
          layout — TODO confirm with the kernel authors.
        * The preprocess launch receives only ``o``'s strides although it also
          takes ``do`` — presumably both share a layout; verify.
        * NOTE(review): the fused kernel is described as using atomics, so
          ``dq`` presumably has to be zero-initialised by the caller — confirm.

    Args:
        config: optional tuning dictionary with the keys
            ``"preprocess_kernel"``, ``"dkdvdq_kernel_N64"`` and
            ``"dkdvdq_kernel_N128"``; defaults to ``_get_config()``.

    Returns:
        The ``delta`` tensor (same shape and dtype as ``softmax_lse``).

    Raises:
        ValueError: if ``dbias`` is given, or if the inputs are FP8 and any of
            the four descale tensors is missing.
    """
    if dbias is not None:
        raise ValueError("Bias is not supported yet in the Triton Backend")

    IS_FP8 = _is_fp8(q)
    if IS_FP8:
        descale_tensors = (descale_q, descale_k, descale_v, descale_do)
        # Fail early with a clear message instead of an AttributeError on None.
        if any(t is None for t in descale_tensors):
            raise ValueError(
                "FP8 inputs require descale_q, descale_k, descale_v and descale_do"
            )
        FP8_MAX = torch.finfo(q.dtype).max
        descale_strides = tuple(t.stride(0) for t in descale_tensors)
    else:
        FP8_MAX = None
        descale_strides = (None, None, None, None)

    IS_VARLEN = cu_seqlens_q is not None

    # Shapes, plus a helper that returns strides in the
    # (batch, head, sequence, head_dim) order the kernels expect.
    if IS_VARLEN:
        # Layout for q,k,v is thd ie [total tokens, num_head, head_dim]
        batch = len(cu_seqlens_q) - 1
        num_q_heads, head_sz = q.shape[1], q.shape[2]
        num_k_heads = k.shape[1]

        def _kernel_strides(t):
            # No batch dimension in thd layout, so the batch stride is 0.
            return (0, t.stride(1), t.stride(0), t.stride(2))

    else:
        # Layout for q,k,v is bshd ie [batch, seq_len, num_head, head_dim]
        batch, _, num_q_heads, head_sz = q.shape
        num_k_heads = k.shape[2]

        def _kernel_strides(t):
            return (t.stride(0), t.stride(2), t.stride(1), t.stride(3))

    q_strides = _kernel_strides(q)
    k_strides = _kernel_strides(k)
    v_strides = _kernel_strides(v)
    o_strides = _kernel_strides(o)
    dq_strides = _kernel_strides(dq)
    dk_strides = _kernel_strides(dk)
    do_strides = _kernel_strides(do)

    # BLOCK_D_MODEL, BLOCK_D_MODEL_POW2
    # padding for head_dim. Power of 2 or 16
    BLOCK_D_MODEL_POW2 = max(triton.next_power_of_2(head_sz), 16)

    # init delta
    delta = torch.zeros_like(softmax_lse)
    if IS_VARLEN:
        # Reordered to (batch, head, token) with a zero batch stride.
        delta_strides = (0, delta.stride(1), delta.stride(0))
    else:
        # [batch, num_q_heads, seqlen_q]
        delta_strides = delta.stride()

    # preprocess
    # compute D(delta) = rowsum(dO*O). Note, multiplication is element-wise.
    if config is None:
        config = _get_config()

    pre_block = config["preprocess_kernel"]["PRE_BLOCK"]
    pre_grid = (
        triton.cdiv(max_seqlen_q, pre_block),
        batch,
        num_q_heads,
    )

    _bwd_preprocess[pre_grid](
        o,
        do,
        delta,
        *o_strides,
        *delta_strides,
        descale_strides[3],
        cu_seqlens_q,
        max_seqlen_q,
        descale_do,
        BLOCK_M=pre_block,
        BLOCK_D_MODEL=head_sz,
        BLOCK_D_MODEL_POW2=BLOCK_D_MODEL_POW2,
        IS_VARLEN=IS_VARLEN,
        IS_FP8=IS_FP8,
    )

    # dropout_mask
    use_dropout = dropout_p > 0.0
    if use_dropout:
        dropout_mask = torch.zeros(
            (batch, num_q_heads, max_seqlen_q, max_seqlen_k),
            device=q.device,
            dtype=torch.float32,
        )
        dropout_strides = dropout_mask.stride()
    else:
        dropout_mask = None
        dropout_strides = (0, 0, 0, 0)

    # Fuses dk,dv and dq computations into one kernel using atomics
    if BLOCK_D_MODEL_POW2 > 160 or q.dtype == torch.float32:
        config_dkdvdq = config["dkdvdq_kernel_N64"]
    else:
        config_dkdvdq = config["dkdvdq_kernel_N128"]

    num_k_pids = triton.cdiv(max_seqlen_k, config_dkdvdq["BLOCK_N"])
    # Query the device that actually holds the tensors rather than whichever
    # CUDA device happens to be current.
    NUM_SMS = torch.cuda.get_device_properties(q.device).multi_processor_count

    if causal:
        fused_kernel = _bwd_kernel_dkdvdq_causal
        grid_dkdvdq = (batch * num_q_heads * num_k_pids,)
    else:
        # in non causal inner loop over grouped q heads
        fused_kernel = _bwd_kernel_dkdvdq_noncausal
        grid_dkdvdq = (batch * num_k_heads * num_k_pids,)

    # Both variants take exactly the same argument list.
    fused_kernel[grid_dkdvdq](
        q,
        k,
        v,
        sm_scale,
        do,
        dk,
        dv,
        dq,
        softmax_lse,
        delta,
        *q_strides,
        *k_strides,
        *v_strides,
        *dk_strides,
        *dq_strides,
        *delta_strides,
        *do_strides,
        *dropout_strides,
        *descale_strides,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_mask,
        dropout_p,
        philox_seed,
        philox_offset,
        descale_q,
        descale_k,
        descale_v,
        descale_do,
        NUM_Q_HEADS=num_q_heads,
        NUM_K_HEADS=num_k_heads,
        BATCH=batch,
        NUM_K_PIDS=num_k_pids,
        BLOCK_D_MODEL=head_sz,
        BLOCK_D_MODEL_POW2=BLOCK_D_MODEL_POW2,
        ENABLE_DROPOUT=use_dropout,
        IS_VARLEN=IS_VARLEN,
        IS_FP8=IS_FP8,
        FP8_MAX=FP8_MAX,
        NUM_SMS=NUM_SMS,
        USE_INT64_STRIDES=USE_INT64_STRIDES,
        **config_dkdvdq,
    )

    return delta
# I/O shapes for _bwd_preprocess (computes delta from Out and DO):
# Out: (batch, nhead_q, max_seqlens_q, headDim)
# DO: (batch, nhead_q, max_seqlens_q, headDim)
# Delta: (batch, nheads_q, max_seqlens_q), same as softmax_lse
#
# Grid: program_id(0) tiles the query sequence in BLOCK_M rows, program_id(1)
# is the batch index and program_id(2) the head index.  O and DO are addressed
# with the same offsets, i.e. the stride_o_* values are applied to both.
@triton.jit
def _bwd_preprocess(
    o_ptr,
    do_ptr,  # noqa: E741
    delta_ptr,
    stride_o_b,
    stride_o_h,
    stride_o_m,
    stride_o_k,
    stride_delta_b,
    stride_delta_h,
    stride_delta_m,
    stride_descale_do_z,
    cu_seqlens_q,
    max_seqlen_q,
    descale_do_ptr,
    BLOCK_M: tl.constexpr,
    BLOCK_D_MODEL: tl.constexpr,
    BLOCK_D_MODEL_POW2: tl.constexpr,
    IS_VARLEN: tl.constexpr,
    IS_FP8: tl.constexpr,
):
    pid_m = tl.program_id(0)  # seqlen
    bid = tl.program_id(1)  # batch
    hid = tl.program_id(2)  # head

    # Handle varlen: the defaults below cover the fixed-length case, so no
    # separate else-branch is needed.
    q_start = 0
    seqlen_q = max_seqlen_q
    if IS_VARLEN:
        q_start = tl.load(cu_seqlens_q + bid)
        q_end = tl.load(cu_seqlens_q + bid + 1)
        seqlen_q = q_end - q_start

    # Compute offsets
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_k = tl.arange(0, BLOCK_D_MODEL_POW2)

    # Offset O/DO by batch, head and q_start
    offs = (
        bid * stride_o_b
        + hid * stride_o_h
        + q_start * stride_o_m
        + offs_m[:, None] * stride_o_m
        + offs_k[None, :] * stride_o_k
    )

    # create masks: rows beyond seqlen_q and (if padded) columns beyond the
    # real head dimension are excluded
    mask_m = offs_m < seqlen_q
    mask = mask_m[:, None]
    PADDED_HEAD: tl.constexpr = BLOCK_D_MODEL != BLOCK_D_MODEL_POW2
    if PADDED_HEAD:
        mask &= offs_k[None, :] < BLOCK_D_MODEL

    # load [BLOCK_M, BLOCK_D_MODEL_POW2]
    o = tl.load(o_ptr + offs, mask=mask, other=0.0)
    do = tl.load(do_ptr + offs, mask=mask, other=0.0)

    # compute and write-back to delta
    if IS_FP8:
        descale_do = tl.load(descale_do_ptr + bid * stride_descale_do_z + hid)

        # NOTE: do is in the fp8 range and o is not in fp8
        delta = tl.sum(o.to(tl.float32) * (do.to(tl.float32) * descale_do), axis=1)
    else:
        delta = tl.sum(o.to(tl.float32) * do.to(tl.float32), axis=1)

    offs_delta = (
        bid * stride_delta_b
        + hid * stride_delta_h
        + q_start * stride_delta_m
        + offs_m * stride_delta_m
    )
    tl.store(delta_ptr + offs_delta, delta, mask=mask_m)


# The main inner-loop logic for computing dK and dV.
#
# For one fixed K/V tile (rows start_n .. start_n + BLOCK_N) this walks
# num_steps query tiles of BLOCK_M rows starting at start_m, accumulating into
# dk and dv, which are returned.  Q, DO, M and D are base pointers that the
# caller has already offset; k and v are tiles the caller has already loaded.
@triton.jit
def _bwd_dkdv_inner(
    dk,
    dv,  # output
    Q,
    k,
    v,
    DO,
    M,
    D,
    sm_scale,  # input tensor
    stride_qm,
    stride_qk,
    stride_dom,
    stride_dok,
    stride_dropoutm,
    stride_dropoutn,
    stride_deltam,
    BLOCK_M: tl.constexpr,  # 16
    BLOCK_N: tl.constexpr,  # 128
    HEAD_DIM: tl.constexpr,  #
    ACTUAL_HEAD_DIM: tl.constexpr,  #
    dropout_p,
    philox_seed,
    batch_philox_offset,
    dropout_offset,
    alibi_slope,
    seqlen_q,
    seqlen_k,  # max sequence length for q and k
    # Filled in by the wrapper.
    start_n,
    start_m,
    num_steps,  # iteration numbers
    descale_q,
    descale_k,
    descale_v,
    descale_do,  # fp8 descale factors from user
    MASK: tl.constexpr,  # causal masking, only apply to tiles on mask diagonal
    ENABLE_DROPOUT: tl.constexpr,  # activate dropout
    USE_ALIBI: tl.constexpr,
    USE_EXP2: tl.constexpr,  # activate exp2
    IS_FP8: tl.constexpr,
    FP8_MAX: tl.constexpr,
    DEBUG_TRITON: tl.constexpr,
    DEBUG_TRITON_DETAIL: tl.constexpr,
):
    # if HEAD_DIM is padded
    PADDED_HEAD: tl.constexpr = ACTUAL_HEAD_DIM != HEAD_DIM
    delta_qk = seqlen_q - seqlen_k
    offs_m = start_m + tl.arange(0, BLOCK_M)  # start_m + (0 .. BLOCK_M - 1)
    offs_n = start_n + tl.arange(0, BLOCK_N)  # start_n + (0 .. BLOCK_N - 1)
    offs_k = tl.arange(0, HEAD_DIM)
    # mask to make sure not OOB of seqlen_k
    mask_n = offs_n < seqlen_k
    # Q and DO are (seqlen_q, head_dim)
    # qT_ptrs = (1, BLOCK_M) + (HEAD_DIM, 1), transpose of q
    qT_ptrs = Q + offs_m[None, :] * stride_qm + offs_k[:, None] * stride_qk
    # do_ptrs = (BLOCK_M, 1) + (1, HEAD_DIM), NOT transposed
    do_ptrs = DO + offs_m[:, None] * stride_dom + offs_k[None, :] * stride_dok
    # BLOCK_N must be a multiple of BLOCK_M, otherwise the code wouldn't work.
    tl.static_assert(BLOCK_N % BLOCK_M == 0)
    curr_m = start_m
    step_m = BLOCK_M
    curr_philox_offset = batch_philox_offset
    curr_dropout_offset = dropout_offset
    RCP_LN2: tl.constexpr = 1.4426950408889634  # = 1.0 / ln(2)

    for blk_idx in range(num_steps):
        if DEBUG_TRITON:
            print(f"iter {blk_idx}: curr_m = {curr_m}")  # noqa: E701
        offs_m = curr_m + tl.arange(0, BLOCK_M)
        # update the mask because offs_m advanced
        mask_m = offs_m < seqlen_q
        mask_qT = mask_m[None, :]
        mask_do = mask_m[:, None]
        mask_nm = mask_n[:, None] & (offs_m[None, :] < seqlen_q)
        if PADDED_HEAD:
            mask_qT &= offs_k[:, None] < ACTUAL_HEAD_DIM
            mask_do &= offs_k[None, :] < ACTUAL_HEAD_DIM
        qT = tl.load(qT_ptrs, mask=mask_qT, other=0.0)
        # generate dropout mask
        if ENABLE_DROPOUT:
            # NOTE: dropout is transposed because it is used to mask pT
            philox_offs = (
                curr_philox_offset
                + offs_m[None, :] * stride_dropoutm
                + offs_n[:, None] * stride_dropoutn
            )
            if tl_DROPOUT_USE_PYTORCH:
                dropout_offs = (
                    offs_m[None, :] * stride_dropoutm
                    + offs_n[:, None] * stride_dropoutn
                )
                dropout_mask = tl.load(curr_dropout_offset + dropout_offs, mask=mask_nm)
            else:
                rand_vals = tl.rand(philox_seed, philox_offs)
                dropout_mask = rand_vals > dropout_p
            dropout_scale = 1.0 / (1 - dropout_p)
        # Load m before computing qk to reduce pipeline stall.
        m = tl.load(M + offs_m * stride_deltam, mask=mask_m, other=0.0)
        if IS_FP8:
            qkT = tl.dot(k, qT) * descale_q * descale_k
        else:
            qkT = tl.dot(k, qT)
        qkT_scaled = qkT * sm_scale

        if USE_ALIBI:
            relative_pos_block = offs_n[:, None] + seqlen_q - seqlen_k - offs_m[None, :]
            alibi_block = -1 * alibi_slope * tl.abs(relative_pos_block)
            qkT_scaled += alibi_block

        if DEBUG_TRITON_DETAIL:
            if start_n == 256:
                print(f"qT: {qT.shape}\n", qT)
                print(f"k: {k.shape}\n", k)
                print(f"qkT scaled: {qkT.shape}\n", qkT_scaled)
        # TODO: remove the scaling of m later when we removed re-scaling in fwd
        if USE_EXP2:
            pT = tl.math.exp2(qkT_scaled * RCP_LN2 - m[None, :] * RCP_LN2)
        else:
            pT = tl.math.exp(qkT_scaled - m[None, :])

        # Autoregressive masking.
        if MASK:
            # offset offs_m with delta_qk since the causal mask starts at
            # bottom right of the (seqlen_q, seqlen_k) matrix
            causal_mask = (offs_m[None, :] - delta_qk) >= offs_n[:, None]
            mask = causal_mask & mask_nm
            if DEBUG_TRITON_DETAIL:
                if start_n == 256:
                    print(f"causal_mask: {causal_mask.shape}\n", causal_mask)
                    print(
                        f"qkT after causal: {qkT.shape}\n",
                        tl.where(causal_mask, qkT * sm_scale, 0.0),
                    )
            pT = tl.where(mask, pT, 0.0)
        do = tl.load(do_ptrs, mask=mask_do, other=0.0)
        # Compute dV.
        if ENABLE_DROPOUT:
            pT_dropout = tl.where(dropout_mask, pT, 0.0) * dropout_scale
            if IS_FP8:
                scale_p_dropout, descale_p_dropout = _compute_fp8_scaling_factors(
                    pT_dropout, FP8_MAX
                )
                dv += (
                    tl.dot((pT_dropout * scale_p_dropout).to(do.type.element_ty), do)
                    * descale_p_dropout
                    * descale_do
                )
            else:
                dv += tl.dot(pT_dropout.to(do.type.element_ty), do)
        else:
            if IS_FP8:
                scale_pT, descale_pT = _compute_fp8_scaling_factors(pT, FP8_MAX)
                dv += (
                    tl.dot((pT * scale_pT).to(do.type.element_ty), do)
                    * descale_pT
                    * descale_do
                )
            else:
                dv += tl.dot(pT.to(do.type.element_ty), do)

        if DEBUG_TRITON_DETAIL:
            if start_n == 256:
                print(f"pT: {pT.shape}\n", pT)
        # D (= delta) is pre-divided by ds_scale.
        # other=0.0 gives out-of-range query lanes a defined value (matching
        # the Delta load in _bwd_dq_inner); an undefined lane could otherwise
        # become NaN in dsT and reach dk through NaN * 0 in the dot product.
        Di = tl.load(D + offs_m * stride_deltam, mask=mask_m, other=0.0)
        # Compute dP and dS.
        if IS_FP8:
            dpT = tl.dot(v, tl.trans(do)) * descale_v * descale_do
        else:
            dpT = tl.dot(v, tl.trans(do))
        if ENABLE_DROPOUT:
            dpT = tl.where(dropout_mask, dpT, 0.0) * dropout_scale
        delta_i = Di[None, :]
        dsT = pT * (dpT - delta_i)
        if IS_FP8:
            scale_dsT, descale_dsT = _compute_fp8_scaling_factors(dsT, FP8_MAX)
            dk += (
                tl.dot((dsT * scale_dsT).to(qT.type.element_ty), tl.trans(qT))
                * descale_dsT
                * descale_q
            )
        else:
            dk += tl.dot(dsT.to(qT.type.element_ty), tl.trans(qT))
        # Increment pointers.
        curr_m += step_m
        qT_ptrs += step_m * stride_qm
        do_ptrs += step_m * stride_dom
    return dk, dv


# the main inner-loop logic for computing dQ
#
# For one fixed query tile (rows start_m .. start_m + BLOCK_M2) this walks
# num_steps key/value tiles of BLOCK_N2 columns starting at start_n, masking
# columns at or beyond end_n, and accumulates into dq, which is returned.
# q, do and m are values the caller has already loaded; K, V and Delta are
# base pointers.
@triton.jit
def _bwd_dq_inner(
    dq,  # output
    q,
    K,
    V,
    do,
    m,
    Delta,
    sm_scale,  # input
    # shared by Q/K/V.
    stride_qm,
    stride_qk,
    stride_kn,
    stride_kk,
    stride_vn,
    stride_vk,
    stride_dropoutm,
    stride_dropoutn,  # stride for dropout
    stride_deltam,
    seqlen_q,
    seqlen_k,  #
    BLOCK_M2: tl.constexpr,  #
    BLOCK_N2: tl.constexpr,  #
    HEAD_DIM: tl.constexpr,
    ACTUAL_HEAD_DIM: tl.constexpr,  #
    dropout_p,
    philox_seed,
    batch_philox_offset,
    dropout_offset,
    alibi_slope,
    # Filled in by the wrapper.
    start_m,
    start_n,
    end_n,
    num_steps,  #
    descale_q,
    descale_k,
    descale_v,
    descale_do,  # fp8 descale factors from user
    MASK: tl.constexpr,
    ENABLE_DROPOUT: tl.constexpr,
    USE_ALIBI: tl.constexpr,
    USE_EXP2: tl.constexpr,
    IS_FP8: tl.constexpr,
    FP8_MAX: tl.constexpr,
    DEBUG_TRITON: tl.constexpr,
    DEBUG_TRITON_DETAIL: tl.constexpr,
):
    # if HEAD_DIM is padded
    PADDED_HEAD: tl.constexpr = ACTUAL_HEAD_DIM != HEAD_DIM
    delta_qk = seqlen_q - seqlen_k
    offs_m = start_m + tl.arange(0, BLOCK_M2)
    offs_n = start_n + tl.arange(0, BLOCK_N2)
    offs_k = tl.arange(0, HEAD_DIM)

    # mask to make sure not OOB of seqlen_q
    mask_m = offs_m < seqlen_q

    kT_ptrs = K + offs_n[None, :] * stride_kn + offs_k[:, None] * stride_kk
    vT_ptrs = V + offs_n[None, :] * stride_vn + offs_k[:, None] * stride_vk
    # D (= delta) is pre-divided by ds_scale.
    Di = tl.load(Delta + offs_m * stride_deltam, mask=mask_m, other=0.0)
    # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
    tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
    curr_n = start_n
    step_n = BLOCK_N2
    curr_philox_offset = batch_philox_offset
    curr_dropout_offset = dropout_offset
    RCP_LN2: tl.constexpr = 1.4426950408889634  # = 1.0 / ln(2)
    for blk_idx in range(num_steps):
        if DEBUG_TRITON:
            print(f"iter {blk_idx}: curr_n = {curr_n}")  # noqa: E701
        offs_n = curr_n + tl.arange(0, BLOCK_N2)
        # end_n is needed because the end of causal True might not be perfectly
        # aligned with the end of the block
        mask_n = offs_n < end_n
        if DEBUG_TRITON_DETAIL:
            print(
                f"start_n = {start_n}, end_n = {end_n}, offs_n: {offs_n.shape}\n{offs_n}"
            )  # noqa: E701
        if DEBUG_TRITON_DETAIL:
            print(f"mask_n: {mask_n.shape}\n{mask_n}")  # noqa: E701
        mask_kT = mask_n[None, :]
        mask_mn = mask_m[:, None] & (offs_n[None, :] < end_n)
        if PADDED_HEAD:
            mask_kT &= offs_k[:, None] < ACTUAL_HEAD_DIM

        kT = tl.load(kT_ptrs, mask=mask_kT, other=0.0)
        vT = tl.load(vT_ptrs, mask=mask_kT, other=0.0)

        if ENABLE_DROPOUT:
            # Offsets are laid out [m, n] here (not transposed) because the
            # mask is applied to dp, which is [BLOCK_M2, BLOCK_N2].
            philox_offs = (
                curr_philox_offset
                + offs_m[:, None] * stride_dropoutm
                + offs_n[None, :] * stride_dropoutn
            )
            if tl_DROPOUT_USE_PYTORCH:
                dropout_offs = (
                    offs_m[:, None] * stride_dropoutm
                    + offs_n[None, :] * stride_dropoutn
                )
                dropout_mask = tl.load(curr_dropout_offset + dropout_offs, mask=mask_mn)
            else:
                rand_vals = tl.rand(philox_seed, philox_offs)
                dropout_mask = rand_vals > dropout_p
            dropout_scale = 1 / (1 - dropout_p)

        if IS_FP8:
            qk = tl.dot(q, kT) * descale_q * descale_k
        else:
            qk = tl.dot(q, kT)
        qk_scaled = qk * sm_scale

        if USE_ALIBI:
            relative_pos_block = offs_m[:, None] + seqlen_k - seqlen_q - offs_n[None, :]
            alibi_block = -1 * alibi_slope * tl.abs(relative_pos_block)
            qk_scaled += alibi_block

        if DEBUG_TRITON_DETAIL:
            print(f"qk scaled: {qk.shape}\n", qk_scaled)  # noqa: E701
        if USE_EXP2:
            p = tl.math.exp2(qk_scaled * RCP_LN2 - m * RCP_LN2)
        else:
            p = tl.math.exp(qk_scaled - m)

        # Autoregressive masking.
        if MASK:
            causal_mask = (offs_m[:, None] - delta_qk) >= offs_n[None, :]
            mask = causal_mask & mask_mn
            p = tl.where(mask, p, 0.0)
        # Compute dP and dS.
        if IS_FP8:
            dp = tl.dot(do, vT) * descale_do * descale_v
        else:
            dp = tl.dot(do, vT)
        if ENABLE_DROPOUT:
            dp = tl.where(dropout_mask, dp, 0.0) * dropout_scale
        delta_i = Di[:, None]
        ds = p * (dp - delta_i)
        # Compute dQ.
        # NOTE: We need to de-scale dq in the end, because kT was pre-scaled.
        if IS_FP8:
            scale_ds, descale_ds = _compute_fp8_scaling_factors(ds, FP8_MAX)
            dq += (
                tl.dot((ds * scale_ds).to(kT.type.element_ty), tl.trans(kT))
                * descale_ds
                * descale_k
            )
        else:
            dq += tl.dot(ds.to(kT.type.element_ty), tl.trans(kT))
        # Increment pointers.
        curr_n += step_n
        kT_ptrs += step_n * stride_kn
        vT_ptrs += step_n * stride_vn
    return dq
Descale_v, + Descale_do, + BLOCK_M1: tl.constexpr, + BLOCK_N1: tl.constexpr, + BLOCK_M2: tl.constexpr, + BLOCK_N2: tl.constexpr, + BLK_SLICE_FACTOR: tl.constexpr, + HEAD_DIM: tl.constexpr, + ACTUAL_HEAD_DIM: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + IS_VARLEN: tl.constexpr, + USE_ALIBI: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_FP8: tl.constexpr, + FP8_MAX: tl.constexpr, + FP8_OUTPUT: tl.constexpr, + DEBUG_TRITON: tl.constexpr, + DEBUG_TRITON_DETAIL: tl.constexpr, + USE_INT64_STRIDES: tl.constexpr, +): + if USE_INT64_STRIDES: + stride_qb = tl.cast(stride_qb_in, tl.int64) + stride_qh = tl.cast(stride_qh_in, tl.int64) + stride_qm = tl.cast(stride_qm_in, tl.int64) + stride_qd = tl.cast(stride_qd_in, tl.int64) + stride_kb = tl.cast(stride_kb_in, tl.int64) + stride_kh = tl.cast(stride_kh_in, tl.int64) + stride_kn = tl.cast(stride_kn_in, tl.int64) + stride_kd = tl.cast(stride_kd_in, tl.int64) + stride_vb = tl.cast(stride_vb_in, tl.int64) + stride_vh = tl.cast(stride_vh_in, tl.int64) + stride_vn = tl.cast(stride_vn_in, tl.int64) + stride_vd = tl.cast(stride_vd_in, tl.int64) + stride_dqb = tl.cast(stride_dqb_in, tl.int64) + stride_dqh = tl.cast(stride_dqh_in, tl.int64) + stride_dqm = tl.cast(stride_dqm_in, tl.int64) + stride_dqd = tl.cast(stride_dqd_in, tl.int64) + stride_dkb = tl.cast(stride_dkb_in, tl.int64) + stride_dkh = tl.cast(stride_dkh_in, tl.int64) + stride_dkn = tl.cast(stride_dkn_in, tl.int64) + stride_dkd = tl.cast(stride_dkd_in, tl.int64) + stride_dvb = tl.cast(stride_dvb_in, tl.int64) + stride_dvh = tl.cast(stride_dvh_in, tl.int64) + stride_dvn = tl.cast(stride_dvn_in, tl.int64) + stride_dvd = tl.cast(stride_dvd_in, tl.int64) + stride_deltab = tl.cast(stride_deltab_in, tl.int64) + stride_deltah = tl.cast(stride_deltah_in, tl.int64) + stride_deltam = tl.cast(stride_deltam_in, tl.int64) + stride_dob = tl.cast(stride_dob_in, tl.int64) + stride_doh = tl.cast(stride_doh_in, tl.int64) + stride_dom = tl.cast(stride_dom_in, tl.int64) + stride_dod = 
tl.cast(stride_dod_in, tl.int64) + philox_offset_base = tl.cast(philox_offset_base_in, tl.int64) + stride_dropoutb = tl.cast(stride_dropoutb_in, tl.int64) + stride_dropouth = tl.cast(stride_dropouth_in, tl.int64) + stride_dropoutm = tl.cast(stride_dropoutm_in, tl.int64) + stride_dropoutn = tl.cast(stride_dropoutn_in, tl.int64) + if IS_FP8: + stride_descale_q_z = tl.cast(stride_descale_q_z_in, tl.int64) + stride_descale_k_z = tl.cast(stride_descale_k_z_in, tl.int64) + stride_descale_v_z = tl.cast(stride_descale_v_z_in, tl.int64) + stride_descale_do_z = tl.cast(stride_descale_do_z_in, tl.int64) + stride_az = tl.cast(stride_az_in, tl.int64) + stride_ah = tl.cast(stride_ah_in, tl.int64) + else: + stride_qb = stride_qb_in + stride_qh = stride_qh_in + stride_qm = stride_qm_in + stride_qd = stride_qd_in + stride_kb = stride_kb_in + stride_kh = stride_kh_in + stride_kn = stride_kn_in + stride_kd = stride_kd_in + stride_vb = stride_vb_in + stride_vh = stride_vh_in + stride_vn = stride_vn_in + stride_vd = stride_vd_in + stride_dqb = stride_dqb_in + stride_dqh = stride_dqh_in + stride_dqm = stride_dqm_in + stride_dqd = stride_dqd_in + stride_dkb = stride_dkb_in + stride_dkh = stride_dkh_in + stride_dkn = stride_dkn_in + stride_dkd = stride_dkd_in + stride_dvb = stride_dvb_in + stride_dvh = stride_dvh_in + stride_dvn = stride_dvn_in + stride_dvd = stride_dvd_in + stride_deltab = stride_deltab_in + stride_deltah = stride_deltah_in + stride_deltam = stride_deltam_in + stride_dob = stride_dob_in + stride_doh = stride_doh_in + stride_dom = stride_dom_in + stride_dod = stride_dod_in + philox_offset_base = philox_offset_base_in + stride_dropoutb = stride_dropoutb_in + stride_dropouth = stride_dropouth_in + stride_dropoutm = stride_dropoutm_in + stride_dropoutn = stride_dropoutn_in + stride_descale_q_z = stride_descale_q_z_in + stride_descale_k_z = stride_descale_k_z_in + stride_descale_v_z = stride_descale_v_z_in + stride_descale_do_z = stride_descale_do_z_in + stride_az = 
stride_az_in + stride_ah = stride_ah_in + + # program ids + hkid = tl.program_id(0) + pid = tl.program_id(1) + bid = tl.program_id(2) + if DEBUG_TRITON: + print(f"\npid: {pid}, bid: {bid}, hkid: {hkid}") # noqa: E701 + # figure out varlen start and end + q_start = 0 + k_start = 0 + seqlen_q = max_seqlen_q + seqlen_k = max_seqlen_k + if IS_VARLEN: + # Compute actual sequence lengths + q_start = tl.load(cu_seqlens_q + bid) + q_end = tl.load(cu_seqlens_q + bid + 1) + k_start = tl.load(cu_seqlens_k + bid) + k_end = tl.load(cu_seqlens_k + bid + 1) + seqlen_q = q_end - q_start + seqlen_k = k_end - k_start + + delta_qk = seqlen_q - seqlen_k + if DEBUG_TRITON: + print(f"delta_qk = {delta_qk}") # noqa: E701 + PADDED_HEAD: tl.constexpr = ACTUAL_HEAD_DIM != HEAD_DIM + offs_d = tl.arange(0, HEAD_DIM) + GROUP_SIZE: tl.constexpr = HQ // HK + + # align the delta_qk + start_n = pid * BLOCK_N1 + if start_n < seqlen_k: + # This section does dk and dv + dk = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32) + dv = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32) + + # q > k: diretcly skip all the way until the start of causal block + start_delta_q_gt_k = delta_qk + # q < k: some blocks will have no Masked block, other needs to re-calc + # starting position + # delta_qk is negative so flip it, only multiple of BLOCK_N can skip the + # masked op + num_blocks_skip = -delta_qk // BLOCK_N1 + delta_aligned = (num_blocks_skip + 1) * BLOCK_N1 + delta_qk + start_delta_q_lt_k = delta_aligned // BLOCK_M1 * BLOCK_M1 + if delta_qk >= 0: + start_delta = delta_qk + if DEBUG_TRITON: + print( + f"q >= k: start_delta = delta_qk aligned to BLOCK_M = {start_delta_q_gt_k}" + ) # noqa: E701 + else: + start_delta = start_delta_q_lt_k + if DEBUG_TRITON: + print( + f"q < k: start_delta = residue btw multiple BLOCK_N and delta_qk = {delta_aligned} = aligned to BLOCK_M = {start_delta_q_lt_k}" + ) # noqa: E701 + + offs_n = start_n + tl.arange(0, BLOCK_N1) + # Mask for loading K and V + mask_kv = offs_n[:, None] 
< seqlen_k + if PADDED_HEAD: + mask_d = offs_d < ACTUAL_HEAD_DIM + mask_kv &= mask_d[None, :] + + # K/V tensors not changed for the group + adj_k = ( + bid * stride_kb + + hkid * stride_kh + + k_start * stride_kn + + offs_n[:, None] * stride_kn + + offs_d[None, :] * stride_kd + ) + adj_v = ( + bid * stride_vb + + hkid * stride_vh + + k_start * stride_vn + + offs_n[:, None] * stride_vn + + offs_d[None, :] * stride_vd + ) + # load K and V: they stay in SRAM throughout the inner loop. + k = tl.load(K + adj_k, mask=mask_kv, other=0.0) + v = tl.load(V + adj_v, mask=mask_kv, other=0.0) + # If MQA / GQA, set the K and V head offsets appropriately. + # hqid = hkid + for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE): + if delta_qk >= 0: + start_m = start_n + start_delta + len_m = BLOCK_N1 + else: + start_m = max(start_n + delta_qk, 0) + start_m = start_m // BLOCK_M1 * BLOCK_M1 + # because we might shift the masked blocks up, we are deeper into + # the masked out region, so we would potentially increase the total + # steps with masked operation to get out of it + residue_m = max(start_n + delta_qk - start_m, 0) + len_m = BLOCK_N1 + residue_m + if DEBUG_TRITON: + print(f"residue_m = {residue_m}") # noqa: E701 + + # offset input and output tensor by batch and Q/K heads + adj_q = bid * stride_qb + hqid * stride_qh + q_start * stride_qm + Q_ptr = Q + adj_q + adj_do = bid * stride_dob + hqid * stride_doh + q_start * stride_dom + DO_ptr = DO + adj_do + adj_delta = ( + bid * stride_deltab + hqid * stride_deltah + q_start * stride_deltam + ) + M_ptr = M + adj_delta + Delta_ptr = Delta + adj_delta + + if USE_ALIBI: + alibi_offset = bid * stride_az + hqid * stride_ah + alibi_slope = tl.load(Alibi_slopes + alibi_offset) + else: + alibi_slope = None + + # batch_philox_offset is the ACTUALLY dropout offset + # dropout_offset is for debug purpose and will be removed later + batch_philox_offset = 0 + dropout_offset = 0 + if ENABLE_DROPOUT: + batch_philox_offset = ( + 
philox_offset_base + bid * stride_dropoutb + hqid * stride_dropouth + ) + dropout_offset = ( + Dropout_mask + bid * stride_dropoutb + hqid * stride_dropouth + ) + + if IS_FP8: + descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid) + descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid) + descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid) + descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid) + else: + descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0 + + MASK_BLOCK_M1: tl.constexpr = BLOCK_M1 // BLK_SLICE_FACTOR + # bound the masked operation to q len so it does not have to wast cycles + len_m = min(len_m, seqlen_q) + num_steps = tl.cdiv(len_m, MASK_BLOCK_M1) + # when q < k, we may skip the initial masked op + if pid < num_blocks_skip: + num_steps = 0 + + # if start_m is negative, the current N-tile has no block on the + # diagonal of causal mask, so everything have no causal mask + if DEBUG_TRITON: + print( + f"Masked: start_n: {start_n}; start_m: {start_m}, num_steps: {num_steps}" + ) # noqa: E701 + dk, dv = _bwd_dkdv_inner( + dk, + dv, # output tensors + Q_ptr, + k, + v, + DO_ptr, + M_ptr, + Delta_ptr, + sm_scale, # input tensors + stride_qm, + stride_qd, # strides for q + stride_dom, + stride_dod, # strides for o + stride_dropoutm, + stride_dropoutn, # strides for dropout + stride_deltam, + MASK_BLOCK_M1, + BLOCK_N1, # block dim + HEAD_DIM, + ACTUAL_HEAD_DIM, # head dim + dropout_p, + philox_seed, + batch_philox_offset, + dropout_offset, + alibi_slope, + seqlen_q, + seqlen_k, # max sequence length for q and k + start_n, + start_m, + num_steps, # iteration numbers + descale_q, + descale_k, + descale_v, + descale_do, + MASK=True, # causal masking + ENABLE_DROPOUT=ENABLE_DROPOUT, # activate dropout + USE_ALIBI=USE_ALIBI, + USE_EXP2=USE_EXP2, + IS_FP8=IS_FP8, + FP8_MAX=FP8_MAX, + DEBUG_TRITON=DEBUG_TRITON, + DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL, + ) + start_m += num_steps * MASK_BLOCK_M1 + 
num_steps = tl.cdiv(seqlen_q - start_m, BLOCK_M1) + end_m = start_m + num_steps * BLOCK_M1 + + if DEBUG_TRITON: + print( + f"start_m after Masked step: {start_m}; num_steps: {num_steps}" + ) # noqa: E701 + if DEBUG_TRITON: + print( + f"unMasked: start_n: {start_n}, start_m: {start_m}, end_m: {end_m}, num_steps: {num_steps}" + ) # noqa: E701 + if DEBUG_TRITON: + print("unMasked") # noqa: E701 + dk, dv = _bwd_dkdv_inner( + dk, + dv, # output tensors + Q_ptr, + k, + v, + DO_ptr, + M_ptr, + Delta_ptr, + sm_scale, # input tensors + stride_qm, + stride_qd, # strides for q + stride_dom, + stride_dod, # strides for o + stride_dropoutm, + stride_dropoutn, # strides for dropout + stride_deltam, + BLOCK_M1, + BLOCK_N1, # block dim + HEAD_DIM, + ACTUAL_HEAD_DIM, # head dim + dropout_p, + philox_seed, + batch_philox_offset, + dropout_offset, + alibi_slope, + seqlen_q, + seqlen_k, # max sequence length for q and k + start_n, + start_m, + num_steps, # iteration numbers + descale_q, + descale_k, + descale_v, + descale_do, + MASK=False, # causal masking + ENABLE_DROPOUT=ENABLE_DROPOUT, # activate dropout + USE_ALIBI=USE_ALIBI, + USE_EXP2=USE_EXP2, + IS_FP8=IS_FP8, + FP8_MAX=FP8_MAX, + DEBUG_TRITON=DEBUG_TRITON, + DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL, + ) + # end of GQA/MQA of dkdv + # Write back dV + adj_dv = bid * stride_dvb + hkid * stride_dvh + k_start * stride_dvn + offs_dv = offs_n[:, None] * stride_dvn + offs_d[None, :] * stride_dvd + tl.store(DV + adj_dv + offs_dv, dv, mask=mask_kv) + # write back dk + adj_dk = bid * stride_dkb + hkid * stride_dkh + k_start * stride_dkn + offs_dk = offs_n[:, None] * stride_dkn + offs_d[None, :] * stride_dkd + dk *= sm_scale + tl.store(DK + adj_dk + offs_dk, dk, mask=mask_kv) + + # This part does dq + start_m = pid * BLOCK_M2 + if start_m < seqlen_q: + # seqlen_q > seqlen_k, no need to process these tile for dq + if DEBUG_TRITON: + print( + f"end_n = start_m + BLOCK_M = {start_m} + {BLOCK_M2} = {start_m + BLOCK_M2}" + ) # noqa: E701 + if 
start_m + BLOCK_M2 < delta_qk: + if DEBUG_TRITON: + print( + f"start_m + BLOCK_M2 = {start_m} + {BLOCK_M2} = {start_m + BLOCK_M2} < delta_qk of {delta_qk}" + ) # noqa: E701 + return + + offs_m = start_m + tl.arange(0, BLOCK_M2) + # Mask for loading K and V + mask_q = offs_m[:, None] < seqlen_q + if PADDED_HEAD: + mask_d = offs_d < ACTUAL_HEAD_DIM + mask_q &= mask_d[None, :] + offs_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd + offs_do = offs_m[:, None] * stride_dom + offs_d[None, :] * stride_dod + # NOTE: don't assume that the strides for k and v are the same! + K += bid * stride_kb + hkid * stride_kh + k_start * stride_kn + V += bid * stride_vb + hkid * stride_vh + k_start * stride_vn + + # If MQA / GQA, set the K and V head offsets appropriately. + for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE): + # seqlen_q < seqlen_k: delta_qk more kv tokens are added at the front + # for every M-tile + end_n = start_m + BLOCK_M2 - delta_qk + # clamp end_n at [0, seqlen_k] + end_n = max(min(end_n, seqlen_k), 0) + if DEBUG_TRITON: + print(f"delta_qk: {delta_qk}; end_n: {end_n}") # noqa: E701 + # offset input and output tensor by batch and Q/K heads + adj_q = bid * stride_qb + hqid * stride_qh + q_start * stride_qm + adj_do = bid * stride_dob + hqid * stride_doh + q_start * stride_dom + adj_delta = ( + bid * stride_deltab + hqid * stride_deltah + q_start * stride_deltam + ) + Delta_ptr = Delta + adj_delta + + if USE_ALIBI: + alibi_offset = bid * stride_az + hqid * stride_ah + alibi_slope = tl.load(Alibi_slopes + alibi_offset) + else: + alibi_slope = None + + # batch_philox_offset is the ACTUALLY dropout offset + # dropout_offset is for debug purpose and will be removed later + batch_philox_offset = 0 + dropout_offset = 0 + if ENABLE_DROPOUT: + batch_philox_offset = ( + philox_offset_base + bid * stride_dropoutb + hqid * stride_dropouth + ) + dropout_offset = ( + Dropout_mask + bid * stride_dropoutb + hqid * stride_dropouth + ) + q = 
tl.load(Q + adj_q + offs_q, mask=mask_q, other=0.0) + do = tl.load(DO + adj_do + offs_do, mask=mask_q, other=0.0) + m = tl.load(M + adj_delta + offs_m * stride_deltam, mask=offs_m < seqlen_q) + m = m[:, None] + + MASK_BLOCK_N2: tl.constexpr = BLOCK_N2 // BLK_SLICE_FACTOR + # start can only be 0 at minimum + start_n = max(end_n - BLOCK_M2, 0) + num_steps = tl.cdiv(end_n - start_n, MASK_BLOCK_N2) + + if IS_FP8: + descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid) + descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid) + descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid) + descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid) + else: + descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0 + + dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32) + dq = _bwd_dq_inner( + dq, + q, + K, + V, + do, + m, + Delta_ptr, + sm_scale, + stride_qm, + stride_qd, + stride_kn, + stride_kd, + stride_vn, + stride_vd, + stride_dropoutm, + stride_dropoutn, + stride_deltam, + seqlen_q, + seqlen_k, + BLOCK_M2, + MASK_BLOCK_N2, + HEAD_DIM, + ACTUAL_HEAD_DIM, + dropout_p, + philox_seed, + batch_philox_offset, + dropout_offset, + alibi_slope, + start_m, + start_n, + end_n, + num_steps, + descale_q, + descale_k, + descale_v, + descale_do, + MASK=True, # + ENABLE_DROPOUT=ENABLE_DROPOUT, + USE_ALIBI=USE_ALIBI, + USE_EXP2=USE_EXP2, + IS_FP8=IS_FP8, + FP8_MAX=FP8_MAX, + DEBUG_TRITON=DEBUG_TRITON, + DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL, + ) + end_n -= num_steps * MASK_BLOCK_N2 + num_steps = tl.cdiv(end_n, BLOCK_N2) + start_n = max(end_n - num_steps * BLOCK_N2, 0) + if DEBUG_TRITON: + print( + f"unMasked: start_m: {start_m}, start_n: {start_n}, end_n: {end_n}, num_steps: {num_steps}" + ) # noqa: E701 + dq = _bwd_dq_inner( + dq, + q, + K, + V, + do, + m, + Delta_ptr, + sm_scale, + stride_qm, + stride_qd, + stride_kn, + stride_kd, + stride_vn, + stride_vd, + stride_dropoutm, + stride_dropoutn, + stride_deltam, + seqlen_q, + seqlen_k, + 
@triton.jit
def bwd_kernel_noncausal(
    Q,
    K,
    V,
    sm_scale,
    DO,
    DQ,
    DK,
    DV,
    M,
    Delta,
    stride_qb_in,
    stride_qh_in,
    stride_qm_in,
    stride_qd_in,
    stride_kb_in,
    stride_kh_in,
    stride_kn_in,
    stride_kd_in,
    stride_vb_in,
    stride_vh_in,
    stride_vn_in,
    stride_vd_in,
    stride_dqb_in,
    stride_dqh_in,
    stride_dqm_in,
    stride_dqd_in,
    stride_dkb_in,
    stride_dkh_in,
    stride_dkn_in,
    stride_dkd_in,
    stride_dvb_in,
    stride_dvh_in,
    stride_dvn_in,
    stride_dvd_in,
    stride_deltab_in,
    stride_deltah_in,
    stride_deltam_in,
    stride_dob_in,
    stride_doh_in,
    stride_dom_in,
    stride_dod_in,
    stride_dropoutb_in,
    stride_dropouth_in,
    stride_dropoutm_in,
    stride_dropoutn_in,
    stride_descale_q_z_in,
    stride_descale_k_z_in,
    stride_descale_v_z_in,
    stride_descale_do_z_in,
    stride_az_in,
    stride_ah_in,
    HQ,
    HK,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    Dropout_mask,
    dropout_p,
    philox_seed,
    philox_offset_base_in,
    Alibi_slopes,
    Descale_q,
    Descale_k,
    Descale_v,
    Descale_do,
    BLOCK_M1: tl.constexpr,  # 32
    BLOCK_N1: tl.constexpr,  # 128
    BLOCK_M2: tl.constexpr,  # 128
    BLOCK_N2: tl.constexpr,  # 32
    BLK_SLICE_FACTOR: tl.constexpr,
    HEAD_DIM: tl.constexpr,
    ACTUAL_HEAD_DIM: tl.constexpr,
    ENABLE_DROPOUT: tl.constexpr,
    IS_VARLEN: tl.constexpr,
    USE_ALIBI: tl.constexpr,
    USE_EXP2: tl.constexpr,
    IS_FP8: tl.constexpr,
    FP8_MAX: tl.constexpr,
    FP8_OUTPUT: tl.constexpr,
    DEBUG_TRITON: tl.constexpr,
    DEBUG_TRITON_DETAIL: tl.constexpr,
    USE_INT64_STRIDES: tl.constexpr,
):
    # Fused attention backward kernel for the non-causal case.
    #
    # One program is identified by (kv head, tile index, batch).  The same
    # tile index `pid` is used twice:
    #   1. as a K/V tile index (width BLOCK_N1) to accumulate dK and dV over
    #      every query head of the group and every Q row, and
    #   2. as a Q tile index (width BLOCK_M2) to accumulate dQ over every
    #      K/V column, once per query head of the group.
    # Both dK and dQ are multiplied by `sm_scale` before being stored.
    #
    # HEAD_DIM is the extent used for `tl.arange`/accumulators, while
    # ACTUAL_HEAD_DIM bounds the valid columns (see `mask_d` below).
    # BLK_SLICE_FACTOR and FP8_OUTPUT are accepted but not referenced in this
    # kernel body.

    # Optionally promote every stride (and the philox base offset) to int64
    # before it takes part in pointer arithmetic.
    if USE_INT64_STRIDES:
        stride_qb = tl.cast(stride_qb_in, tl.int64)
        stride_qh = tl.cast(stride_qh_in, tl.int64)
        stride_qm = tl.cast(stride_qm_in, tl.int64)
        stride_qd = tl.cast(stride_qd_in, tl.int64)
        stride_kb = tl.cast(stride_kb_in, tl.int64)
        stride_kh = tl.cast(stride_kh_in, tl.int64)
        stride_kn = tl.cast(stride_kn_in, tl.int64)
        stride_kd = tl.cast(stride_kd_in, tl.int64)
        stride_vb = tl.cast(stride_vb_in, tl.int64)
        stride_vh = tl.cast(stride_vh_in, tl.int64)
        stride_vn = tl.cast(stride_vn_in, tl.int64)
        stride_vd = tl.cast(stride_vd_in, tl.int64)
        stride_dqb = tl.cast(stride_dqb_in, tl.int64)
        stride_dqh = tl.cast(stride_dqh_in, tl.int64)
        stride_dqm = tl.cast(stride_dqm_in, tl.int64)
        stride_dqd = tl.cast(stride_dqd_in, tl.int64)
        stride_dkb = tl.cast(stride_dkb_in, tl.int64)
        stride_dkh = tl.cast(stride_dkh_in, tl.int64)
        stride_dkn = tl.cast(stride_dkn_in, tl.int64)
        stride_dkd = tl.cast(stride_dkd_in, tl.int64)
        stride_dvb = tl.cast(stride_dvb_in, tl.int64)
        stride_dvh = tl.cast(stride_dvh_in, tl.int64)
        stride_dvn = tl.cast(stride_dvn_in, tl.int64)
        stride_dvd = tl.cast(stride_dvd_in, tl.int64)
        stride_deltab = tl.cast(stride_deltab_in, tl.int64)
        stride_deltah = tl.cast(stride_deltah_in, tl.int64)
        stride_deltam = tl.cast(stride_deltam_in, tl.int64)
        stride_dob = tl.cast(stride_dob_in, tl.int64)
        stride_doh = tl.cast(stride_doh_in, tl.int64)
        stride_dom = tl.cast(stride_dom_in, tl.int64)
        stride_dod = tl.cast(stride_dod_in, tl.int64)
        philox_offset_base = tl.cast(philox_offset_base_in, tl.int64)
        stride_dropoutb = tl.cast(stride_dropoutb_in, tl.int64)
        stride_dropouth = tl.cast(stride_dropouth_in, tl.int64)
        stride_dropoutm = tl.cast(stride_dropoutm_in, tl.int64)
        stride_dropoutn = tl.cast(stride_dropoutn_in, tl.int64)
        # The descale strides are only cast (and only read later) when FP8
        # is enabled.
        if IS_FP8:
            stride_descale_q_z = tl.cast(stride_descale_q_z_in, tl.int64)
            stride_descale_k_z = tl.cast(stride_descale_k_z_in, tl.int64)
            stride_descale_v_z = tl.cast(stride_descale_v_z_in, tl.int64)
            stride_descale_do_z = tl.cast(stride_descale_do_z_in, tl.int64)
        stride_az = tl.cast(stride_az_in, tl.int64)
        stride_ah = tl.cast(stride_ah_in, tl.int64)
    else:
        stride_qb = stride_qb_in
        stride_qh = stride_qh_in
        stride_qm = stride_qm_in
        stride_qd = stride_qd_in
        stride_kb = stride_kb_in
        stride_kh = stride_kh_in
        stride_kn = stride_kn_in
        stride_kd = stride_kd_in
        stride_vb = stride_vb_in
        stride_vh = stride_vh_in
        stride_vn = stride_vn_in
        stride_vd = stride_vd_in
        stride_dqb = stride_dqb_in
        stride_dqh = stride_dqh_in
        stride_dqm = stride_dqm_in
        stride_dqd = stride_dqd_in
        stride_dkb = stride_dkb_in
        stride_dkh = stride_dkh_in
        stride_dkn = stride_dkn_in
        stride_dkd = stride_dkd_in
        stride_dvb = stride_dvb_in
        stride_dvh = stride_dvh_in
        stride_dvn = stride_dvn_in
        stride_dvd = stride_dvd_in
        stride_deltab = stride_deltab_in
        stride_deltah = stride_deltah_in
        stride_deltam = stride_deltam_in
        stride_dob = stride_dob_in
        stride_doh = stride_doh_in
        stride_dom = stride_dom_in
        stride_dod = stride_dod_in
        philox_offset_base = philox_offset_base_in
        stride_dropoutb = stride_dropoutb_in
        stride_dropouth = stride_dropouth_in
        stride_dropoutm = stride_dropoutm_in
        stride_dropoutn = stride_dropoutn_in
        stride_descale_q_z = stride_descale_q_z_in
        stride_descale_k_z = stride_descale_k_z_in
        stride_descale_v_z = stride_descale_v_z_in
        stride_descale_do_z = stride_descale_do_z_in
        stride_az = stride_az_in
        stride_ah = stride_ah_in

    # program ids: axis 0 = kv head, axis 1 = tile index, axis 2 = batch
    hkid = tl.program_id(0)
    pid = tl.program_id(1)
    bid = tl.program_id(2)
    if DEBUG_TRITON:
        print(f"\npid: {pid}, bid: {bid}, hkid: {hkid}")  # noqa: E701
    # figure out varlen start and end
    q_start = 0
    k_start = 0
    seqlen_q = max_seqlen_q
    seqlen_k = max_seqlen_k
    if IS_VARLEN:
        # Compute actual sequence lengths from the cumulative-length arrays
        q_start = tl.load(cu_seqlens_q + bid)
        q_end = tl.load(cu_seqlens_q + bid + 1)
        k_start = tl.load(cu_seqlens_k + bid)
        k_end = tl.load(cu_seqlens_k + bid + 1)
        seqlen_q = q_end - q_start
        seqlen_k = k_end - k_start

    PADDED_HEAD: tl.constexpr = ACTUAL_HEAD_DIM != HEAD_DIM
    offs_d = tl.arange(0, HEAD_DIM)
    # number of query heads that share one K/V head
    GROUP_SIZE: tl.constexpr = HQ // HK

    # ---- This part does dK and dV ----
    start_n = pid * BLOCK_N1
    if start_n < seqlen_k:
        dk = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)
        dv = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)

        offs_n = start_n + tl.arange(0, BLOCK_N1)
        # Mask for loading K and V
        mask_kv = offs_n[:, None] < seqlen_k
        if PADDED_HEAD:
            mask_d = offs_d < ACTUAL_HEAD_DIM
            mask_kv &= mask_d[None, :]
        # NOTE: don't assume that the strides for k and v are the same!
        # K/V tensors not changed for the group
        adj_k = (
            bid * stride_kb
            + hkid * stride_kh
            + k_start * stride_kn
            + offs_n[:, None] * stride_kn
            + offs_d[None, :] * stride_kd
        )
        adj_v = (
            bid * stride_vb
            + hkid * stride_vh
            + k_start * stride_vn
            + offs_n[:, None] * stride_vn
            + offs_d[None, :] * stride_vd
        )
        # load K and V: they stay in SRAM throughout the inner loop.
        k = tl.load(K + adj_k, mask=mask_kv, other=0.0)
        v = tl.load(V + adj_v, mask=mask_kv, other=0.0)
        # If MQA / GQA, set the K and V head offsets appropriately.
        # dk/dv keep accumulating across all query heads of this group.
        for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE):
            # offset input and output tensor by batch and Q/K heads
            adj_q = bid * stride_qb + hqid * stride_qh + q_start * stride_qm
            Q_ptr = Q + adj_q
            adj_do = bid * stride_dob + hqid * stride_doh + q_start * stride_dom
            DO_ptr = DO + adj_do
            # M and Delta are addressed with the same (delta) strides
            adj_delta = (
                bid * stride_deltab + hqid * stride_deltah + q_start * stride_deltam
            )
            M_ptr = M + adj_delta
            Delta_ptr = Delta + adj_delta

            if USE_ALIBI:
                alibi_offset = bid * stride_az + hqid * stride_ah
                alibi_slope = tl.load(Alibi_slopes + alibi_offset)
            else:
                alibi_slope = None

            # batch_philox_offset is the ACTUALLY dropout offset
            # dropout_offset is for debug purpose and will be removed later
            batch_philox_offset = 0
            dropout_offset = 0
            if ENABLE_DROPOUT:
                batch_philox_offset = (
                    philox_offset_base + bid * stride_dropoutb + hqid * stride_dropouth
                )
                dropout_offset = (
                    Dropout_mask + bid * stride_dropoutb + hqid * stride_dropouth
                )

            # Q/dO descale factors are indexed by query head, K/V by kv head.
            if IS_FP8:
                descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid)
                descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid)
                descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid)
                descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid)
            else:
                descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0

            # because there is no causal, we always start from the beginning
            start_m = 0
            num_steps = tl.cdiv(seqlen_q, BLOCK_M1)
            dk, dv = _bwd_dkdv_inner(
                dk,
                dv,  # output tensors
                Q_ptr,
                k,
                v,
                DO_ptr,
                M_ptr,
                Delta_ptr,
                sm_scale,  # input tensors
                stride_qm,
                stride_qd,  # strides for q
                stride_dom,
                stride_dod,  # strides for o
                stride_dropoutm,
                stride_dropoutn,  # strides for dropout
                stride_deltam,
                BLOCK_M1,
                BLOCK_N1,  # block dim
                HEAD_DIM,
                ACTUAL_HEAD_DIM,  # head dim
                dropout_p,
                philox_seed,
                batch_philox_offset,
                dropout_offset,  #
                alibi_slope,
                seqlen_q,
                seqlen_k,  # max sequence length for q and k
                start_n,
                start_m,
                num_steps,  # iteration numbers
                descale_q,
                descale_k,
                descale_v,
                descale_do,  # fp8 descale factors from user
                MASK=False,  # causal masking
                ENABLE_DROPOUT=ENABLE_DROPOUT,  # activate dropout
                USE_ALIBI=USE_ALIBI,
                USE_EXP2=USE_EXP2,
                IS_FP8=IS_FP8,
                FP8_MAX=FP8_MAX,
                DEBUG_TRITON=DEBUG_TRITON,
                DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
            )

        # Write back dV
        adj_dv = bid * stride_dvb + hkid * stride_dvh + k_start * stride_dvn
        offs_dv = offs_n[:, None] * stride_dvn + offs_d[None, :] * stride_dvd
        tl.store(DV + adj_dv + offs_dv, dv, mask=mask_kv)
        # write back dk (scaled by sm_scale once, after the accumulation)
        adj_dk = bid * stride_dkb + hkid * stride_dkh + k_start * stride_dkn
        offs_dk = offs_n[:, None] * stride_dkn + offs_d[None, :] * stride_dkd
        dk *= sm_scale
        tl.store(DK + adj_dk + offs_dk, dk, mask=mask_kv)

    # THIS PART DOES DQ
    # `pid` is reinterpreted as a Q tile index of width BLOCK_M2.
    start_m = pid * BLOCK_M2
    if start_m < seqlen_q:
        offs_m = start_m + tl.arange(0, BLOCK_M2)
        # Mask for loading Q and dO (also used when storing dQ)
        mask_q = offs_m[:, None] < seqlen_q
        if PADDED_HEAD:
            mask_d = offs_d < ACTUAL_HEAD_DIM
            mask_q &= mask_d[None, :]
        offs_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
        offs_do = offs_m[:, None] * stride_dom + offs_d[None, :] * stride_dod
        # Advance the K/V base pointers to this batch / kv head / sequence.
        K += bid * stride_kb + hkid * stride_kh + k_start * stride_kn
        V += bid * stride_vb + hkid * stride_vh + k_start * stride_vn
        # If MQA / GQA, set the K and V head offsets appropriately.
        for hqid in range(hkid * GROUP_SIZE, hkid * GROUP_SIZE + GROUP_SIZE):
            # offset input and output tensor by batch and Q/K heads
            adj_q = bid * stride_qb + hqid * stride_qh + q_start * stride_qm
            adj_do = bid * stride_dob + hqid * stride_doh + q_start * stride_dom
            adj_delta = (
                bid * stride_deltab + hqid * stride_deltah + q_start * stride_deltam
            )
            Delta_ptr = Delta + adj_delta

            if USE_ALIBI:
                alibi_offset = bid * stride_az + hqid * stride_ah
                alibi_slope = tl.load(Alibi_slopes + alibi_offset)
            else:
                alibi_slope = None

            # batch_philox_offset is the ACTUALLY dropout offset
            # dropout_offset is for debug purpose and will be removed later
            batch_philox_offset = 0
            dropout_offset = 0
            if ENABLE_DROPOUT:
                batch_philox_offset = (
                    philox_offset_base + bid * stride_dropoutb + hqid * stride_dropouth
                )
                dropout_offset = (
                    Dropout_mask + bid * stride_dropoutb + hqid * stride_dropouth
                )

            q = tl.load(Q + adj_q + offs_q, mask=mask_q, other=0.0)
            do = tl.load(DO + adj_do + offs_do, mask=mask_q, other=0.0)
            m = tl.load(M + adj_delta + offs_m * stride_deltam, mask=offs_m < seqlen_q)
            m = m[:, None]

            if IS_FP8:
                descale_q = tl.load(Descale_q + bid * stride_descale_q_z + hqid)
                descale_k = tl.load(Descale_k + bid * stride_descale_k_z + hkid)
                descale_v = tl.load(Descale_v + bid * stride_descale_v_z + hkid)
                descale_do = tl.load(Descale_do + bid * stride_descale_do_z + hqid)
            else:
                descale_q, descale_k, descale_v, descale_do = 1.0, 1.0, 1.0, 1.0

            # no causal mask: sweep the whole K/V range [0, seqlen_k)
            start_n = 0
            end_n = seqlen_k
            num_steps = tl.cdiv(seqlen_k, BLOCK_N2)

            dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
            dq = _bwd_dq_inner(
                dq,
                q,
                K,
                V,
                do,
                m,
                Delta_ptr,
                sm_scale,
                stride_qm,
                stride_qd,
                stride_kn,
                stride_kd,
                stride_vn,
                stride_vd,
                stride_dropoutm,
                stride_dropoutn,
                stride_deltam,
                seqlen_q,
                seqlen_k,
                BLOCK_M2,
                BLOCK_N2,
                HEAD_DIM,
                ACTUAL_HEAD_DIM,
                dropout_p,
                philox_seed,
                batch_philox_offset,
                dropout_offset,
                alibi_slope,
                start_m,
                start_n,
                end_n,
                num_steps,
                descale_q,
                descale_k,
                descale_v,
                descale_do,
                MASK=False,
                ENABLE_DROPOUT=ENABLE_DROPOUT,
                USE_ALIBI=USE_ALIBI,
                USE_EXP2=USE_EXP2,
                IS_FP8=IS_FP8,
                FP8_MAX=FP8_MAX,
                DEBUG_TRITON=DEBUG_TRITON,
                DEBUG_TRITON_DETAIL=DEBUG_TRITON_DETAIL,
            )
            # Write back dQ (one store per query head).
            adj_dq = bid * stride_dqb + hqid * stride_dqh + q_start * stride_dqm
            offs_dq = offs_m[:, None] * stride_dqm + offs_d[None, :] * stride_dqd
            dq *= sm_scale
            tl.store(DQ + adj_dq + offs_dq, dq, mask=mask_q)
def is_contiguous(x, name):
    """Return ``x`` unchanged when it is contiguous, else a contiguous copy.

    Args:
        x: Tensor-like object exposing ``is_contiguous()`` and ``contiguous()``.
        name: Label printed when a copy has to be made.

    Returns:
        ``x`` itself, or the result of ``x.contiguous()``.
    """
    if x.is_contiguous():
        return x
    print(f"{name} is not contiguous")
    return x.contiguous()


@functools.lru_cache(maxsize=1024)
def _get_config():
    """Return the ``"bkwd_onekernel"`` section of the device's MHA config.

    The JSON file ``<AITER_TRITON_CONFIGS_PATH>/<device>-MHA-DEFAULT.json`` is
    read once and kept on the function object as ``_get_config._config_dict``.

    Raises:
        OSError: If the config file cannot be opened.
        KeyError: If the file has no ``"bkwd_onekernel"`` entry.
    """
    if not hasattr(_get_config, "_config_dict"):
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/{dev}-MHA-DEFAULT.json"
        with open(fpath, "r") as file:
            config = json.load(file)
        # Publish the cache only after a successful load; a failed open/parse
        # must not leave an empty dict behind that hides the real error on the
        # next call.
        _get_config._config_dict = config

    return _get_config._config_dict["bkwd_onekernel"]


def flash_attn_onekernel_backward(
    do: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    o: torch.Tensor,
    softmax_lse: torch.Tensor,
    dq: torch.Tensor,
    dk: torch.Tensor,
    dv: torch.Tensor,
    dbias: torch.Tensor,
    sm_scale: float,
    alibi_slopes: Optional[torch.Tensor],
    causal: bool,
    cu_seqlens_q: Optional[torch.Tensor],
    cu_seqlens_k: Optional[torch.Tensor],
    max_seqlen_q: int,
    max_seqlen_k: int,
    dropout_p: float,
    philox_seed: Optional[int] = 0,
    philox_offset: Optional[int] = 0,
    descale_q: Optional[torch.Tensor] = None,
    descale_k: Optional[torch.Tensor] = None,
    descale_v: Optional[torch.Tensor] = None,
    descale_do: Optional[torch.Tensor] = None,
    USE_INT64_STRIDES: Optional[bool] = False,
    config: Optional[Dict[str, any]] = None,
):
    """Launch the fused ("one kernel") Triton attention backward pass.

    Runs ``_bwd_preprocess`` to fill ``delta`` and then a single fused kernel
    (``bwd_kernel_causal`` or ``bwd_kernel_noncausal``) that writes ``dq``,
    ``dk`` and ``dv`` in place.

    Args:
        do, q, k, v, o: Gradient of the output, the forward inputs and the
            forward output.  Layout is ``[total_tokens, heads, head_dim]`` when
            ``cu_seqlens_q`` is given, otherwise ``[batch, seq, heads, head_dim]``.
        softmax_lse: Softmax statistics from the forward pass; ``delta`` is
            allocated with the same shape.
        dq, dk, dv: Output gradient buffers, written by the kernel.
        dbias: Must be ``None``; bias gradients are not supported.
        sm_scale: Softmax scaling factor.
        alibi_slopes: Optional 2-D ALiBi slopes tensor.
        causal: Selects the causal or the non-causal kernel.
        cu_seqlens_q, cu_seqlens_k: Cumulative sequence lengths; passing
            ``cu_seqlens_q`` switches to the variable-length layout.
        max_seqlen_q, max_seqlen_k: Maximum sequence lengths.
        dropout_p: Dropout probability; dropout is enabled when ``> 0``.
        philox_seed, philox_offset: RNG state forwarded to the kernel.
        descale_q, descale_k, descale_v, descale_do: Descale tensors, read
            when ``q`` is an FP8 tensor.
        USE_INT64_STRIDES: Forwarded to the kernel to use 64-bit strides.
        config: Optional tuning dict with ``"preprocess_kernel"`` and
            ``"onekernel"`` entries; defaults to ``_get_config()``.

    Returns:
        The ``delta`` tensor computed by the preprocess kernel.

    Raises:
        ValueError: If ``dbias`` is not ``None``.
    """
    if dbias is not None:
        raise ValueError("Bias is not supported yet in the Triton Backend")

    use_alibi = alibi_slopes is not None
    stride_az, stride_ah = alibi_slopes.stride() if use_alibi else (0, 0)

    IS_FP8 = _is_fp8(q)
    if IS_FP8:
        FP8_MAX = torch.finfo(q.dtype).max
        descale_strides = (
            descale_q.stride(0),
            descale_k.stride(0),
            descale_v.stride(0),
            descale_do.stride(0),
        )
    else:
        FP8_MAX = None
        descale_strides = (None, None, None, None)

    IS_VARLEN = cu_seqlens_q is not None

    # Strides are always handed to the kernels in (batch, head, seq, dim) order.
    if IS_VARLEN:
        # Layout for q,k,v is thd ie [total tokens, num_head, head_dim]
        batch = len(cu_seqlens_q) - 1
        num_q_heads, head_sz = q.shape[1], q.shape[2]
        num_k_heads = k.shape[1]

        def _strides(t):
            # no batch dimension in the packed layout -> batch stride 0
            return (0, t.stride(1), t.stride(0), t.stride(2))

    else:
        # Layout for q,k,v is bshd ie [batch, seq_len, num_head, head_dim]
        batch, _, num_q_heads, head_sz = q.shape
        num_k_heads = k.shape[2]

        def _strides(t):
            return (t.stride(0), t.stride(2), t.stride(1), t.stride(3))

    q_strides = _strides(q)
    k_strides = _strides(k)
    v_strides = _strides(v)
    o_strides = _strides(o)
    dq_strides = _strides(dq)
    dk_strides = _strides(dk)
    dv_strides = _strides(dv)
    do_strides = _strides(do)

    # padding for head_dim. Power of 2 or 16
    BLOCK_D_MODEL_POW2 = max(triton.next_power_of_2(head_sz), 16)

    # Configs
    if config is None:
        config = _get_config()

    # init delta
    delta = torch.zeros_like(softmax_lse)
    if IS_VARLEN:
        # dim 1 is used as the head stride and dim 0 as the token stride
        delta_strides = (0, delta.stride(1), delta.stride(0))
    else:
        # [batch, num_q_heads, seqlen_q]
        delta_strides = delta.stride()

    # preprocess
    # compute D(delta) = rowsum(dO*O). Note, multiplication is element-wise.
    pre_block = config["preprocess_kernel"]["PRE_BLOCK"]
    pre_grid = (
        triton.cdiv(max_seqlen_q, pre_block),
        batch,
        num_q_heads,
    )
    _bwd_preprocess[pre_grid](
        o,
        do,
        delta,
        *o_strides,
        *delta_strides,
        descale_strides[3],
        cu_seqlens_q,
        max_seqlen_q,
        descale_do,
        BLOCK_M=pre_block,
        BLOCK_D_MODEL=head_sz,
        BLOCK_D_MODEL_POW2=BLOCK_D_MODEL_POW2,
        IS_VARLEN=IS_VARLEN,
        IS_FP8=IS_FP8,
    )

    # dropout_mask
    use_dropout = dropout_p > 0.0
    if use_dropout:
        dropout_mask = torch.zeros(
            (batch, num_q_heads, max_seqlen_q, max_seqlen_k),
            device=q.device,
            dtype=torch.float32,
        )
        dropout_strides = dropout_mask.stride()
    else:
        dropout_mask = None
        dropout_strides = (0, 0, 0, 0)

    # The fused kernel reuses one tile index for both the K/V tiles and the Q
    # tiles, so the grid must cover the longer of the two sequences.
    # NOTE(review): the grid is sized with BLOCK_N1 only; this looks like it
    # assumes BLOCK_M2 >= BLOCK_N1 in the config - confirm.
    seqlen = max(max_seqlen_q, max_seqlen_k)

    config_onekernel = config["onekernel"]
    grid = (
        num_k_heads,
        triton.cdiv(seqlen, config_onekernel["BLOCK_N1"]),
        batch,
    )

    # Both kernels share one signature, so only the kernel object differs.
    kernel = bwd_kernel_causal if causal else bwd_kernel_noncausal
    kernel[grid](
        q,
        k,
        v,
        sm_scale,
        do,
        dq,
        dk,
        dv,
        softmax_lse,
        delta,
        *q_strides,
        *k_strides,
        *v_strides,
        *dq_strides,
        *dk_strides,
        *dv_strides,
        *delta_strides,
        *do_strides,
        *dropout_strides,
        *descale_strides,
        stride_az,
        stride_ah,
        num_q_heads,
        num_k_heads,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_mask,
        dropout_p,
        philox_seed,
        philox_offset,
        alibi_slopes,
        descale_q,
        descale_k,
        descale_v,
        descale_do,
        # The kernels build `tl.arange(0, HEAD_DIM)` and mask columns with
        # `offs_d < ACTUAL_HEAD_DIM`, so HEAD_DIM must be the padded power of
        # two and ACTUAL_HEAD_DIM the real head size.
        HEAD_DIM=BLOCK_D_MODEL_POW2,
        ACTUAL_HEAD_DIM=head_sz,
        ENABLE_DROPOUT=use_dropout,
        IS_VARLEN=IS_VARLEN,
        USE_ALIBI=use_alibi,
        USE_EXP2=True,
        IS_FP8=IS_FP8,
        FP8_MAX=FP8_MAX,
        FP8_OUTPUT=False,
        DEBUG_TRITON=False,
        DEBUG_TRITON_DETAIL=False,
        USE_INT64_STRIDES=USE_INT64_STRIDES,
        **config_onekernel,
    )

    return delta
@triton.jit
def _fwd_grouped_kernel_stage1_rope(
    Q,  # Holds [Q_NOPE; Q_PE], b x h x (d+r)
    K_Buffer,  # Holds [KV; K_PE], b*s x (c+r)
    V_buffer,  # Holds [KV], b*s x (c)
    cos_sin_cache,  # max_seq_len x (rotary_dim * 2)
    positions,  # sequence positions
    sm_scale,
    kv_indptr,
    kv_indices,
    Att_Out,  # b x h x NUM_KV_SPLITS x (kv_lora_rank + 1)
    k_pe_t_out,
    stride_qb,
    stride_qh,
    stride_buf_kbs,
    stride_buf_vbs,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    stride_kpe_tokens_out_b,
    stride_cos_sin_cache_s,
    stride_positions_b,
    rotary_dim: tl.constexpr,
    kv_lora_rank: tl.constexpr,
    qk_rope_head_dim: tl.constexpr,
    kv_group_num: tl.constexpr,
    q_head_num: tl.constexpr,
    BLOCK_C: tl.constexpr,
    BLOCK_R: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_H: tl.constexpr,
    NUM_KV_SPLITS: tl.constexpr,
    logit_cap: tl.constexpr,
    USE_ROPE: tl.constexpr,
    IS_NEOX_STYLE: tl.constexpr,
):
    # Stage 1 of split-KV grouped decode attention with optional fused RoPE.
    #
    # Each program handles one (batch, head block, kv split).  It computes the
    # attention logits as dot(q_pe, k_pe) + dot(q, kv), runs an online softmax
    # over its slice of the KV sequence, and stores acc / e_sum in the first
    # kv_lora_rank entries of Att_Out, followed by e_max + log(e_sum) at
    # offset kv_lora_rank.
    # With USE_ROPE, the rotary embedding is applied to q_pe and to the k_pe of
    # the last token of the sequence; that rotated k_pe is also written to
    # k_pe_t_out.
    cur_batch = tl.program_id(0)
    cur_head_id = tl.program_id(1)
    split_kv_id = tl.program_id(2)

    # Number of heads actually processed per program: min(BLOCK_H, kv_group_num)
    if BLOCK_H < kv_group_num:
        VALID_BLOCK_H: tl.constexpr = BLOCK_H
    else:
        VALID_BLOCK_H: tl.constexpr = kv_group_num
    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
    # keep only heads that belong to this head block and exist at all
    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
    mask_h = mask_h & (cur_head < q_head_num)

    offs_c = tl.arange(0, BLOCK_C)
    offs_qk_r = tl.arange(kv_lora_rank, kv_lora_rank + BLOCK_R)  # to get the k_pe

    off_q_pe = (
        cur_batch * stride_qb + cur_head[:, None] * stride_qh + offs_qk_r[None, :]
    )
    offs_q = cur_batch * stride_qb + cur_head[:, None] * stride_qh + offs_c[None, :]

    # BLOCK_C / BLOCK_R may exceed the real extents; mask the excess columns
    mask_c = offs_c < kv_lora_rank
    mask_qk_r = offs_qk_r < (kv_lora_rank + qk_rope_head_dim)

    # kv_indptr[b] .. kv_indptr[b + 1] delimits this batch's entries in kv_indices
    cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx

    q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_c[None, :]), other=0.0)
    q_pe = tl.load(
        Q + off_q_pe, mask=(mask_h[:, None]) & (mask_qk_r[None, :]), other=0.0
    )

    # [split_kv_start, split_kv_end) is the KV range owned by this split
    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
    split_kv_start = kv_len_per_split * split_kv_id
    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)

    # apply rotary embedding for q_pe, and k_pe (last token per batch of K_PE)
    LAST_SPLIT = split_kv_end == cur_batch_seq_len
    k_pe_last_token = tl.zeros([BLOCK_R], dtype=q.dtype)

    if USE_ROPE:
        if IS_NEOX_STYLE:
            # [BLOCK_ROTARY // 2, BLOCK_ROTARY // 2 + 1, BLOCK_ROTARY // 2 + 2, ..., 0, 1, 2, ..., BLOCK_ROTARY // 2 - 1, pass:]
            offs_qk_rot_r = kv_lora_rank + (
                (tl.arange(0, BLOCK_R) + (rotary_dim // 2)) % rotary_dim
            )
            # Which elements to flip
            mask_rotate = tl.arange(0, BLOCK_R) < (rotary_dim // 2)
            # [0 , 1, 2, ..., rotary_dim // 2 - 1, 0 , 1, 2, ..., rotary_dim // 2 - 1]
            offs_rotary = tl.arange(0, BLOCK_R) % (rotary_dim // 2)
        else:
            # [1, 0, 3, 2, 5, 4, ..., BLOCK_R, BLOCK_R - 1]
            offs_qk_rot_r = (
                kv_lora_rank
                + (((tl.arange(0, BLOCK_R) + 1) % 2) * 2)
                - 1
                + tl.arange(0, BLOCK_R)
            )
            mask_rotate = tl.arange(0, BLOCK_R) % 2 < 1
            # [0, 0, 1, 1, ..., rotary_dim // 2 - 1, rotary_dim // 2 - 1]
            offs_rotary = tl.arange(0, BLOCK_R) // 2

        # Columns at or beyond rotary_dim fall back to plain indices here;
        # their cos/sin loads are masked to 1.0 / 0.0 below.
        if qk_rope_head_dim > rotary_dim:
            offs_qk_rot_r = tl.where(
                tl.arange(0, BLOCK_R) < rotary_dim, offs_qk_rot_r, tl.arange(0, BLOCK_R)
            )
            offs_rotary = tl.where(
                tl.arange(0, BLOCK_R) < rotary_dim, offs_rotary, tl.arange(0, BLOCK_R)
            )

        mask_rotary = tl.arange(0, BLOCK_R) < rotary_dim

        pos = tl.load(positions + cur_batch * stride_positions_b)
        # cos is read from the start of the cache row, sin from rotary_dim // 2 on
        cos = tl.load(
            cos_sin_cache + pos * stride_cos_sin_cache_s + offs_rotary,
            mask=mask_rotary,
            other=1.0,
        )
        sin = tl.load(
            cos_sin_cache
            + pos * stride_cos_sin_cache_s
            + offs_rotary
            + rotary_dim // 2,
            mask_rotary,
            other=0.0,
        )

        off_q_pe_rot = (
            cur_batch * stride_qb
            + cur_head[:, None] * stride_qh
            + offs_qk_rot_r[None, :]
        )
        mask_qk_rot_r = offs_qk_rot_r < (kv_lora_rank + qk_rope_head_dim)

        # 0, 2, 4,.... 1, 3, 5...
        q_pe_rot = tl.load(
            Q + off_q_pe_rot,
            mask=(mask_h[:, None]) & (mask_qk_rot_r[None, :]),
            other=0.0,
        )
        # negate the elements selected by mask_rotate
        q_pe_rot = tl.where(mask_rotate[None, :], -q_pe_rot, q_pe_rot)

        q_pe = q_pe * cos + q_pe_rot * sin

        # we only apply to the last token in the K_PE
        if LAST_SPLIT:
            # debug assert
            if (
                cur_batch == 0 and cur_head_id == 0
            ) and split_kv_id < NUM_KV_SPLITS - 1:
                tl.device_assert(False, "Only last split should compute k_pe")

            # physical slot of the last token of this sequence
            kv_loc = tl.load(
                kv_indices + cur_batch_kv_start_idx + cur_batch_seq_len - 1
            )
            offs_buf_k_pe_last_token = kv_loc * stride_buf_kbs + offs_qk_r
            offs_buf_k_pe_rot_last_token = kv_loc * stride_buf_kbs + offs_qk_rot_r
            # NOTE(review): these two loads are unmasked - presumably safe only
            # when BLOCK_R equals qk_rope_head_dim; confirm.
            k_pe_last_token = tl.load(K_Buffer + offs_buf_k_pe_last_token)

            k_pe_rot_last_token = tl.load(K_Buffer + offs_buf_k_pe_rot_last_token)
            k_pe_rot_last_token = tl.where(
                mask_rotate, -k_pe_rot_last_token, k_pe_rot_last_token
            )

            k_pe_last_token = k_pe_last_token * cos + k_pe_rot_last_token * sin

    # online-softmax state: running max, running sum and weighted accumulator
    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
    acc = tl.zeros([BLOCK_H, BLOCK_C], dtype=tl.float32)

    # Splits with an empty KV range store nothing.
    if split_kv_end > split_kv_start:
        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
            offs_n = start_n + tl.arange(0, BLOCK_N)
            kv_loc = tl.load(
                kv_indices + cur_batch_kv_start_idx + offs_n,
                mask=offs_n < split_kv_end,
                other=0,
            )

            offs_buf_kv = kv_loc[None, :] * stride_buf_kbs + offs_c[:, None]
            offs_buf_k_pe = kv_loc[None, :] * stride_buf_kbs + offs_qk_r[:, None]

            k_pe = tl.load(
                K_Buffer + offs_buf_k_pe,
                mask=(offs_n[None, :] < split_kv_end) & (mask_qk_r[:, None]),
                other=0.0,
            )  # positional embedding part of keys

            # In the tile holding the last token, substitute the rotated k_pe
            # for the value that was loaded from K_Buffer.
            if (USE_ROPE and LAST_SPLIT) and start_n >= cur_batch_seq_len - BLOCK_N:
                k_pe = tl.where(
                    offs_n[None, :] != (split_kv_end - 1),
                    k_pe,
                    k_pe_last_token[:, None],
                )

            # (16, 64) x (64, 32)
            # dot product of rope parts
            qk = tl.dot(q_pe, k_pe.to(q_pe.dtype))

            kv = tl.load(
                K_Buffer + offs_buf_kv,
                mask=(offs_n[None, :] < split_kv_end) & (mask_c[:, None]),
                other=0.0,
            )  # the shared latent tensor for keys and values

            # (16, 512) x (512, 32)
            # dot product of nope parts
            qk += tl.dot(q, kv)

            qk *= sm_scale

            if logit_cap > 0:
                qk = logit_cap * _tanh(qk / logit_cap)

            # invalid heads / out-of-range keys must not contribute to softmax
            qk = tl.where(
                mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf")
            )

            offs_buf_v = kv_loc[:, None] * stride_buf_vbs + offs_c[None, :]
            v = tl.load(
                V_buffer + offs_buf_v,
                mask=(offs_n[:, None] < split_kv_end) & (mask_c[None, :]),
                other=0.0,
            )

            # rescale the running state to the new maximum before accumulating
            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
            re_scale = tl.exp(e_max - n_e_max)
            p = tl.exp(qk - n_e_max[:, None])
            acc *= re_scale[:, None]
            # (16, 32) x (32, 512)
            acc += tl.dot(p.to(v.dtype), v)

            e_sum = e_sum * re_scale + tl.sum(p, 1)
            e_max = n_e_max

        offs_mid_o = (
            cur_batch * stride_mid_ob
            + cur_head[:, None] * stride_mid_oh
            + split_kv_id * stride_mid_os
            + offs_c[None, :]
        )

        # publish the rotated k_pe of the last token
        if USE_ROPE:
            if LAST_SPLIT:
                k_pe_last_token_ptrs = (
                    k_pe_t_out
                    + cur_batch * stride_kpe_tokens_out_b
                    + tl.arange(0, BLOCK_R)
                )
                tl.store(k_pe_last_token_ptrs, k_pe_last_token, mask=mask_qk_r)

        # normalised partial output of this split
        tl.store(
            Att_Out + offs_mid_o,
            acc / e_sum[:, None],
            mask=(mask_h[:, None]) & (mask_c[None, :]),
        )

        # slot kv_lora_rank of each row holds e_max + log(e_sum) of this split
        offs_mid_o_1 = (
            cur_batch * stride_mid_ob
            + cur_head * stride_mid_oh
            + split_kv_id * stride_mid_os
            + kv_lora_rank
        )

        tl.store(
            Att_Out + offs_mid_o_1,
            e_max + tl.log(e_sum),
            mask=mask_h,
        )
tl.load( + K_Buffer + offs_buf_kv, + mask=(offs_n[None, :] < split_kv_end) & (mask_c[:, None]), + other=0.0, + ) # the shared latent tensor for keys and values + + # (16, 512) x (512, 32) + # dot product of nope parts + qk += tl.dot(q, kv) + + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * _tanh(qk / logit_cap) + + qk = tl.where( + mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf") + ) + + offs_buf_v = kv_loc[:, None] * stride_buf_vbs + offs_c[None, :] + v = tl.load( + V_buffer + offs_buf_v, + mask=(offs_n[:, None] < split_kv_end) & (mask_c[None, :]), + other=0.0, + ) + + n_e_max = tl.maximum(tl.max(qk, 1), e_max) + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max[:, None]) + acc *= re_scale[:, None] + # (16, 32) x (32, 512) + acc += tl.dot(p.to(v.dtype), v) + + e_sum = e_sum * re_scale + tl.sum(p, 1) + e_max = n_e_max + + offs_mid_o = ( + cur_batch * stride_mid_ob + + cur_head[:, None] * stride_mid_oh + + split_kv_id * stride_mid_os + + offs_c[None, :] + ) + + if USE_ROPE: + if LAST_SPLIT: + k_pe_last_token_ptrs = ( + k_pe_t_out + + cur_batch * stride_kpe_tokens_out_b + + tl.arange(0, BLOCK_R) + ) + tl.store(k_pe_last_token_ptrs, k_pe_last_token, mask=mask_qk_r) + + tl.store( + Att_Out + offs_mid_o, + acc / e_sum[:, None], + mask=(mask_h[:, None]) & (mask_c[None, :]), + ) + + offs_mid_o_1 = ( + cur_batch * stride_mid_ob + + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + + kv_lora_rank + ) + + tl.store( + Att_Out + offs_mid_o_1, + e_max + tl.log(e_sum), + mask=mask_h, + ) + + +# TODO rope offset +def _decode_grouped_att_m_fwd_rope( + q, + k_buffer, + v_buffer, + att_out, + k_pe_tokens_out, + kv_lora_rank, # c + cos_sin_cache, + positions, + rotary_dim, + kv_indptr, + kv_indices, + num_kv_splits, + sm_scale, + logit_cap, + use_rope, + is_neox_style, + config, +): + if use_rope: + assert ( + k_pe_tokens_out is not None + ), "We must output the k_pe tokens with rope applied if rope fusion enabled." 
+ + qk_rope_head_dim = k_buffer.shape[-1] - kv_lora_rank + batch, head_num = kv_indptr.shape[0] - 1, q.shape[1] + kv_group_num = q.shape[1] // k_buffer.shape[1] + + config["BLOCK_C"] = triton.next_power_of_2(kv_lora_rank) + config["BLOCK_R"] = triton.next_power_of_2(qk_rope_head_dim) + + config["NUM_KV_SPLITS"] = num_kv_splits + grid = ( + batch, + triton.cdiv(head_num, min(config["BLOCK_H"], kv_group_num)), + config["NUM_KV_SPLITS"], + ) + + _fwd_grouped_kernel_stage1_rope[grid]( + q, + k_buffer, + v_buffer, + cos_sin_cache, + positions, + sm_scale, + kv_indptr, + kv_indices, + att_out, + k_pe_tokens_out, + q.stride(0), + q.stride(1), + k_buffer.stride(0), + v_buffer.stride(0), + att_out.stride(0), + att_out.stride(1), + att_out.stride(2), + k_pe_tokens_out.stride(0) if use_rope else 0, + cos_sin_cache.stride(0) if use_rope else 0, + positions.stride(0) if use_rope else 0, + rotary_dim, + kv_lora_rank, + qk_rope_head_dim, + kv_group_num=kv_group_num, + q_head_num=head_num, + logit_cap=logit_cap, + USE_ROPE=use_rope, + IS_NEOX_STYLE=is_neox_style, + **config, + ) + + +@triton.jit +def _fwd_kernel_stage2( + Mid_O, + O, + kv_indptr, + stride_mid_ob, + stride_mid_oh, + stride_mid_os, + stride_obs, + stride_oh, + NUM_KV_SPLITS: tl.constexpr, + BLOCK_DV: tl.constexpr, + Lv: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + + cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - tl.load( + kv_indptr + cur_batch + ) + + offs_d = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lv + + e_sum = 0.0 + e_max = -float("inf") + acc = tl.zeros([BLOCK_DV], dtype=tl.float32) + + offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d + offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv + + for split_kv_id in range(0, NUM_KV_SPLITS): + kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, 
cur_batch_seq_len) + + if split_kv_end > split_kv_start: + tv = tl.load( + Mid_O + offs_v + split_kv_id * stride_mid_os, mask=mask_d, other=0.0 + ) + tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os) + n_e_max = tl.maximum(tlogic, e_max) + + old_scale = tl.exp(e_max - n_e_max) + acc *= old_scale + exp_logic = tl.exp(tlogic - n_e_max) + acc += exp_logic * tv + + e_sum = e_sum * old_scale + exp_logic + e_max = n_e_max + + tl.store( + O + cur_batch * stride_obs + cur_head * stride_oh + offs_d, + acc / e_sum, + mask=mask_d, + ) + + +def _decode_softmax_reducev_fwd( + logits, + q, + o, + v_buffer, + kv_indptr, + num_kv_splits, + config, +): + batch, head_num = q.shape[0], q.shape[1] + Lv = v_buffer.shape[-1] + config["BLOCK_DV"] = triton.next_power_of_2(Lv) + + config["NUM_KV_SPLITS"] = num_kv_splits + + grid = (batch, head_num) + _fwd_kernel_stage2[grid]( + logits, + o, + kv_indptr, + logits.stride(0), + logits.stride(1), + logits.stride(2), + o.stride(0), + o.stride(1), + Lv=Lv, + **config, + ) + + +@functools.lru_cache(maxsize=1024) +def _get_config(): + if not hasattr(_get_config, "_config_dict"): + dev = arch_info.get_device() + _get_config._config_dict = {} + fpath = f"{AITER_TRITON_CONFIGS_PATH}/{dev}-MLA_DECODE_ROPE-DEFAULT.json" + with open(fpath, "r") as file: + config = json.load(file) + _get_config._config_dict = config + + return _get_config._config_dict + + +def decode_attention_fwd_grouped_rope( + q: torch.Tensor, + k_buffer: torch.Tensor, + v_buffer: torch.Tensor, + o: torch.Tensor, + kv_indptr: torch.Tensor, + kv_indices: torch.Tensor, + k_pe_tokens: torch.Tensor, + kv_lora_rank: int, + rotary_dim: int, + cos_sin_cache: torch.Tensor, + positions: torch.Tensor, + attn_logits: torch.Tensor, + num_kv_splits: int, + sm_scale: float, + logit_cap: Optional[float] = 0.0, + use_rope: Optional[bool] = False, + is_neox_style: Optional[bool] = False, + config: Optional[dict[str, any]] = None, +): + """ + Implements deepseek decode attention with 
grouped query attention and rotary positional encoding + + parameters: + q: Query Tensor + k_buffer: Key Cache Tensor + v_buffer: Value Cache Tensor + o: Output tensor containing the result of decode. Allocated by the caller + kv_indptr: Offsets delimiting each batch entry's span in kv_indices; it has batch + 1 entries and entry b's KV length is kv_indptr[b + 1] - kv_indptr[b] + kv_indices: Per-token index into the KV cache; the kernels multiply each entry by the buffer's first stride to locate that token's row + k_pe_tokens: Output buffer that receives the rope-applied k_pe of each batch entry's last token. Must not be None when use_rope is True + kv_lora_rank: Width of the latent (non-rope) part of the last dim of k_buffer; the remaining k_buffer.shape[-1] - kv_lora_rank columns are the rope part + rotary_dim: Number of leading rope columns that are rotated; columns at or beyond it are loaded with cos=1 and sin=0 + cos_sin_cache: Table indexed by position along dim 0; in each row the sin values start rotary_dim // 2 entries after the cos values + positions: Position of each batch entry, used to select its row of cos_sin_cache + attn_logits: Intermediate buffer; stage 1 stores each KV split's normalized partial output and its log-sum-exp here and stage 2 reduces them into o + num_kv_splits: Number of splits of the KV sequence (third grid dimension of the stage 1 launch) + sm_scale: Factor the q.k scores are multiplied by before the softmax + logit_cap: If > 0, scores are replaced by logit_cap * tanh(score / logit_cap) + use_rope: Whether the kernel applies the rotary embedding to q_pe and to the last token's k_pe + is_neox_style: Passed to the kernel as IS_NEOX_STYLE to select the rotation layout (NOTE(review): presumably half-split when True and interleaved pairs when False - confirm) + + Returns: + None. The attention output is written into o in place + + """ + if config is None: + config = _get_config() + + _decode_grouped_att_m_fwd_rope( + q, + k_buffer, + v_buffer, + attn_logits, + k_pe_tokens, + kv_lora_rank, + cos_sin_cache, + positions, + rotary_dim, + kv_indptr, + kv_indices, + num_kv_splits, + sm_scale, + logit_cap, + use_rope, + is_neox_style, + config["fwd_grouped_kernel_stage1_rope"], + ) + _decode_softmax_reducev_fwd( + attn_logits, + q, + o, + v_buffer, + kv_indptr, + num_kv_splits, + config["fwd_kernel_stage2"], + ) diff --git a/aiter/ops/triton/moe_align_block_size.py b/aiter/ops/triton/moe_align_block_size.py new file mode 100644 index 0000000000000000000000000000000000000000..b5c13ee9d521a60b30c069082741a72e84852197 --- /dev/null +++ b/aiter/ops/triton/moe_align_block_size.py @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: MIT + +import torch +import triton +import triton.language as tl + + +def ceil_div(a, b): + return (a + b - 1) // b + + +@triton.jit +def _moe_align_block_size_stage1_kernel( + topk_ids_ptr, + tokens_cnts_ptr, + num_experts: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + + start_idx = pid * tokens_per_thread + + off_c = (pid + 1) * num_experts + + for i in range(tokens_per_thread): + if start_idx + i < numel: + idx = tl.load(topk_ids_ptr + start_idx + i) + token_cnt = tl.load(tokens_cnts_ptr + off_c + idx) + tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1) + + +@triton.jit +def _moe_align_block_size_stage2_kernel( + tokens_cnts_ptr, + num_experts: tl.constexpr, +): + pid = 
tl.program_id(0) + + last_cnt = 0 + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid) + last_cnt = last_cnt + token_cnt + tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt) + + +@triton.jit +def _moe_align_block_size_stage3_kernel( + total_tokens_post_pad_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, +): + last_cumsum = 0 + off_cnt = num_experts * num_experts + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1) + last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size + tl.store(cumsum_ptr + i, last_cumsum) + tl.store(total_tokens_post_pad_ptr, last_cumsum) + + +@triton.jit +def _moe_align_block_size_stage4_kernel( + topk_ids_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + start_idx = tl.load(cumsum_ptr + pid) + end_idx = tl.load(cumsum_ptr + pid + 1) + + for i in range(start_idx, end_idx, block_size): + tl.store(expert_ids_ptr + i // block_size, pid) + + start_idx = pid * tokens_per_thread + off_t = pid * num_experts + + for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread, numel)): + expert_id = tl.load(topk_ids_ptr + i) + token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id) + rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id) + tl.store(sorted_token_ids_ptr + rank_post_pad, i) + tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1) + + +def moe_align_block_size_triton( + topk_ids: torch.Tensor, # [num_tkns, num_experts] + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + numel = topk_ids.numel() + grid = (num_experts,) + tokens_cnts = torch.zeros( + (num_experts + 1, num_experts), 
dtype=torch.int32, device=topk_ids.device + ) + cumsum = torch.zeros((num_experts + 1,), dtype=torch.int32, device=topk_ids.device) + tokens_per_thread = ceil_div(numel, num_experts) + + _moe_align_block_size_stage1_kernel[grid]( + topk_ids, + tokens_cnts, + num_experts, + numel, + tokens_per_thread, + ) + + _moe_align_block_size_stage2_kernel[grid]( + tokens_cnts, + num_experts, + ) + + _moe_align_block_size_stage3_kernel[(1,)]( + num_tokens_post_pad, + tokens_cnts, + cumsum, + num_experts, + block_size, + ) + + _moe_align_block_size_stage4_kernel[grid]( + topk_ids, + sorted_token_ids, + expert_ids, + tokens_cnts, + cumsum, + num_experts, + block_size, + numel, + tokens_per_thread, + ) diff --git a/aiter/ops/triton/moe_op.py b/aiter/ops/triton/moe_op.py new file mode 100644 index 0000000000000000000000000000000000000000..92a7e2549430f4a0ea83813b16e97f29becdbbd0 --- /dev/null +++ b/aiter/ops/triton/moe_op.py @@ -0,0 +1,2909 @@ +# SPDX-License-Identifier: MIT + +import functools +import json +import os +import torch +import triton +import triton.language as tl +from typing import Any, Dict, Optional, List +from aiter.ops.triton.utils.core import AITER_TRITON_CONFIGS_PATH +from aiter import logger +import aiter.ops.triton.utils.arch_info as arch_info +from aiter.jit.utils.chip_info import get_cu_num +from aiter.ops.triton.utils.moe_config_utils import get_optimal_moe_config_func + +@functools.lru_cache() +def distinguish_moe_kernel_name(): + training_mode = os.environ.get("TRAINING_MODE", "0") == "1" + return not training_mode + +@functools.lru_cache() +def support_mls(): + return arch_info.get_arch() in ("gfx938") + +capMLS = support_mls() +splitk_size = int(os.environ.get("SPLITK_SIZE", "0")) + + +# def get_splitk_reduce_config(M, N, SPLIT_K): +# """Get optimal configuration for splitk reduce kernel""" +# if M < 32: +# config = {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, "num_warps": 4} +# elif M < 128: +# config = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 
"num_warps": 8} +# else: +# config = {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, "num_warps": 8} +# return config + +# def generate_splitk_reduce_configs(): +# configs = [] +# for block_m in [1, 2, 4, 16]: +# for block_n in [64, 128, 256]: +# for num_warps in [2, 4]: +# for num_stages in [1, 2]: +# config = triton.Config({ +# 'BLOCK_SIZE_M': block_m, +# 'BLOCK_SIZE_N': block_n, +# }, num_warps=num_warps, num_stages=num_stages) +# configs.append(config) +# return configs + +# @triton.autotune( +# key=['M', 'N', 'top_k','compute_type'], +# configs=generate_splitk_reduce_configs(), +# perf_debug=True, +# ) +# @triton.heuristics({ +# 'block_m_dividable': lambda nargs: nargs['M'] % nargs['BLOCK_SIZE_M'] == 0, +# 'block_n_dividable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0, +# }) +# @triton.jit +# def splitk_reduce_kernel( +# # Pointers to matrices +# output_ptr, # [M, N] +# input_ptr, # [SPLIT_K, M, N] +# # Matrix dimensions +# M, +# N: tl.constexpr, +# SPLIT_K: tl.constexpr, +# # Meta-parameters +# BLOCK_SIZE_M: tl.constexpr, +# BLOCK_SIZE_N: tl.constexpr, +# stride_k, +# stride_m, +# stride_n, +# compute_type: tl.constexpr, +# block_m_dividable: tl.constexpr, +# block_n_dividable: tl.constexpr, +# ): +# """ +# Reduce splitk_cache along the first dimension (SPLIT_K dimension). 
+ +# Args: +# output_ptr: shape [M, N] +# input_ptr: shape [SPLIT_K, M, N] +# """ +# tl.assume(stride_k >= 0) +# tl.assume(stride_m >= 0) +# tl.assume(stride_n >= 0) + +# pid = tl.program_id(axis=0) + +# num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) +# num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + +# pid_m = pid // num_pid_n +# pid_n = pid % num_pid_n + +# offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) +# offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + +# mask_m = offs_m < M +# mask_n = offs_n < N + +# # Accumulate in float32 for higher precision +# acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + +# # Sum over SPLIT_K dimension +# for k in range(SPLIT_K): +# input_ptrs = input_ptr + (k * stride_k + +# offs_m[:, None] * stride_m + +# offs_n[None, :] * stride_n) +# if block_m_dividable and block_n_dividable: +# x = tl.load(input_ptrs) +# elif block_n_dividable: +# x = tl.load(input_ptrs, mask=mask_n[None, :], other=0.0) +# elif block_m_dividable: +# x = tl.load(input_ptrs, mask=mask_m[:, None], other=0.0) +# else: +# x = tl.load(input_ptrs, mask=mask_m[:, None] & mask_n[None, :], other=0.0) +# acc += x.to(tl.float32) + +# # Convert to target compute type +# acc = acc.to(compute_type) + +# output_ptrs = output_ptr + (offs_m[:, None] * stride_m + +# offs_n[None, :] * stride_n) + +# if block_m_dividable and block_n_dividable: +# tl.store(output_ptrs, acc) +# elif block_n_dividable: +# tl.store(output_ptrs, acc, mask=mask_n[None, :]) +# elif block_m_dividable: +# tl.store(output_ptrs, acc, mask=mask_m[:, None]) +# else: +# tl.store(output_ptrs, acc, mask=mask_m[:, None] & mask_n[None, :]) + + +# def triton_splitk_reduce(input_tensor, output_tensor): +# """ +# High-performance triton kernel for reducing splitk_cache. 
+ +# Args: +# input_tensor: [SPLIT_K, M, N] - splitk_cache +# output_tensor: [M, N] - output tensor C +# """ +# SPLIT_K, M, N = input_tensor.shape + +# config = get_splitk_reduce_config(M, N, SPLIT_K) +# grid = (triton.cdiv(M, config['BLOCK_SIZE_M']) * triton.cdiv(N, config['BLOCK_SIZE_N']),) + +# grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv( +# N, META['BLOCK_SIZE_N']), ) + +# # Check constraints +# assert output_tensor.dtype == torch.float16 or \ +# output_tensor.dtype == torch.bfloat16 or \ +# output_tensor.dtype == torch.float32 + +# if output_tensor.dtype == torch.float16: +# compute_type = tl.float16 +# elif output_tensor.dtype == torch.bfloat16: +# compute_type = tl.bfloat16 +# elif output_tensor.dtype == torch.float32: +# compute_type = tl.float32 +# else: +# compute_type = tl.float32 # Default to float32 + +# assert input_tensor.is_contiguous() +# assert output_tensor.is_contiguous() +# assert input_tensor.shape[1] == output_tensor.shape[0] +# assert input_tensor.shape[2] == output_tensor.shape[1] + +# splitk_reduce_kernel[grid]( +# output_tensor, +# input_tensor, +# M, +# N, +# SPLIT_K, +# stride_k = input_tensor.stride(0), +# stride_m = input_tensor.stride(1), +# stride_n = input_tensor.stride(2), +# compute_type=compute_type, +# **config, +# ) + +# return output_tensor + + +@triton.jit +def write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, offs_token, + token_mask, BLOCK_SIZE_M, BLOCK_SIZE_N, + compute_type): + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + (stride_cm * offs_token[:, None].to(tl.int64) + stride_cn * offs_cn[ + None, :].to(tl.int64)) + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + +@triton.jit +def pid_grid(pid: int, num_pid_m: int, num_pid_n: int, GROUP_SIZE_M: tl.constexpr = 1): + """ + Maps 1D pid to 2D grid coords (pid_m, pid_n). 
+ + Args: + - pid: 1D pid + - num_pid_m: grid m size + - num_pid_n: grid n size + - GROUP_SIZE_M: tl.constexpr: default is 1 + """ + if GROUP_SIZE_M == 1: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + tl.assume(group_size_m >= 0) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + return pid_m, pid_n + +@triton.jit +def remap_xcd(pid, GRID_MN, NUM_XCDS: tl.constexpr = 4): + ## pid remapping on xcds + + # Optimization: when NUM_XCDS=1, no remapping is needed + if NUM_XCDS == 1: + return pid + + # Number of pids per XCD in the new arrangement + pids_per_xcd = (GRID_MN + NUM_XCDS - 1) // NUM_XCDS + # When GRID_MN cannot divide NUM_XCDS, some xcds will have + # pids_per_xcd pids, the other will have pids_per_xcd - 1 pids. + # We calculate the number of xcds that have pids_per_xcd pids as + # tall_xcds + tall_xcds = GRID_MN % NUM_XCDS + tall_xcds = NUM_XCDS if tall_xcds == 0 else tall_xcds + # Compute current XCD and local pid within the XCD + xcd = pid % NUM_XCDS + local_pid = pid // NUM_XCDS + # Calculate new pid based on the new grouping + # Note that we need to consider the following two cases: + # 1. the current pid is on a tall xcd + # 2. 
the current pid is on a short xcd + if xcd < tall_xcds: + pid = xcd * pids_per_xcd + local_pid + else: + pid = ( + tall_xcds * pids_per_xcd + + (xcd - tall_xcds) * (pids_per_xcd - 1) + + local_pid + ) + + return pid + + +def _fused_moe_kernel_gptq_awq_repr(specialization): + if distinguish_moe_kernel_name(): + constants = specialization.constants + mul_routed_weight = constants.get('MUL_ROUTED_WEIGHT', False) + return "fused_moe_kernel_gptq_awq_bot" if mul_routed_weight else "fused_moe_kernel_gptq_awq" + else: + return "fused_moe_kernel_gptq_awq" + + +@triton.heuristics(values={ + 'block_k_diviable': lambda nargs: nargs['K'] % (nargs['BLOCK_SIZE_K']) == 0, + 'block_n_diviable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0, + "num_groups": lambda nargs: triton.cdiv(nargs["BLOCK_SIZE_K"], nargs["group_size"]), + 'group_size_divisible': lambda nargs: nargs['BLOCK_SIZE_K'] % nargs['group_size'] == 0, +}) +@triton.jit(repr=_fused_moe_kernel_gptq_awq_repr) +def fused_moe_kernel_gptq_awq( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + b_scale_ptr, + b_zp_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + sorted_weights_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N: tl.constexpr, + K: tl.constexpr, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
+ stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_bse, + stride_bsk, + stride_bsn, + stride_bze, + stride_bzk, + stride_bzn, + block_k_diviable: tl.constexpr, + block_n_diviable: tl.constexpr, + group_size: tl.constexpr, + group_size_divisible: tl.constexpr, + num_groups: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + COMBINE_SCALE_LOAD: tl.constexpr, + USE_MLS_LOAD: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + USE_ADDR_OFFSET_INT64_A: tl.constexpr, + USE_ADDR_OFFSET_INT64_C: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + has_zp: tl.constexpr, + use_int4_w4a16: tl.constexpr, + use_int8_w8a16: tl.constexpr, + ck_sorting: tl.constexpr, + ck_topk: tl.constexpr, + NUM_XCDS: tl.constexpr): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. 
The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + + tl.assume(stride_am >= 0) + tl.assume(stride_ak >= 0) + tl.assume(stride_be >= 0) + tl.assume(stride_bk >= 0) + tl.assume(stride_bn >= 0) + tl.assume(stride_cm >= 0) + tl.assume(stride_cn >= 0) + tl.assume(stride_bse >= 0) + tl.assume(stride_bsk >= 0) + tl.assume(stride_bsn >= 0) + tl.assume(stride_bze >= 0) + tl.assume(stride_bzk >= 0) + tl.assume(stride_bzn >= 0) + + # to notify the compiler that sorted_token_ids_ptr is a pointer to the memory, + # and all value in the memory is non-negative. + tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0) + + tl.static_assert(COMBINE_SCALE_LOAD == False, "COMBINE_SCALE_LOAD not support for awq!") + tl.static_assert(USE_MLS_LOAD == False, "USE_MLS_LOAD not support yet for awq!") + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. 
+ pid = tl.program_id(axis=0) + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + + num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + GRID_MN = num_pid_n * num_pid_m + if pid < GRID_MN: + pid = remap_xcd(pid, GRID_MN, NUM_XCDS) + else: + return # rest of the tiles are dummy paddings + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M) + + offs_token_id = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)).to(tl.int32) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + + if ck_sorting: + token_id = (offs_token & 0x00FFFFFF) + topk_id = (offs_token >> 24) & 0xFF + offs_token = token_id * ck_topk + topk_id + + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m) + if off_experts == -1: + # ----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. + + # c_ptr will be a zero inited buffer, no need to write zero explicitly + # write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, + # offs_token, token_mask, BLOCK_SIZE_M, + # BLOCK_SIZE_N, compute_type) + return + + tl.assume(off_experts >= 0) + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N)).to(tl.int32) % N + offs_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int32) + + + if USE_ADDR_OFFSET_INT64_A: + a_ptrs = a_ptr + (offs_token[:, None].to(tl.int64) // top_k * stride_am + + offs_k[None, :].to(tl.int64) * stride_ak) + else: + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak).to(tl.int32) + + if use_int4_w4a16: + if group_size_divisible and has_zp: + offs_k_continue = tl.arange(0, BLOCK_SIZE_K // 2).to(tl.int32) + b_ptrs = b_ptr + (off_experts * stride_be + \ + offs_bn[:, None] * stride_bn + offs_k_continue[None, :] * \ + stride_bk).to(tl.int32) + else: + b_ptrs = b_ptr + (off_experts * stride_be + \ + (offs_k[:, None] // 2) * stride_bk 
+ offs_bn[None, :] * \ + stride_bn).to(tl.int32) + b_shifter = (offs_k[:, None] % 2) * 4 + elif use_int8_w8a16: + b_ptrs = b_ptr + (off_experts * stride_be + \ + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn).to(tl.int32) + + if not has_zp and use_int4_w4a16: + b_zp_num = 8 + if not has_zp and use_int8_w8a16: + b_zp_num = 128 + elif has_zp and use_int4_w4a16: + b_zp_shifter = (offs_bn[None, :] % 2) * 4 + + # w4a16 deepseek case + if use_int4_w4a16 and (group_size_divisible and has_zp): + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + offs_szk = k * num_groups + tl.arange(0, num_groups) + + if not block_k_diviable: + k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K + k_other = 0.0 + + masks_szk = offs_szk < K // group_size + masks_a = token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K) + else: + k_mask = None + k_other = None + + masks_szk = None + masks_a = token_mask[:, None] + + a = tl.load(a_ptrs, + mask=masks_a, + other=0.0) + b = tl.load(b_ptrs) + if use_int4_w4a16: + b = tl.interleave(b, b) + b = tl.trans(b) + + b = (b >> b_shifter) & 0xF + + b_scale_ptrs = b_scale_ptr + (off_experts * stride_bse + \ + offs_bn[None, :] * stride_bsn + \ + offs_szk[:, None] * stride_bsk).to(tl.int32) + b_scale = tl.load(b_scale_ptrs, mask=masks_szk, other=k_other) + b_scale = b_scale.to(tl.float32) + + b_zp_ptrs = b_zp_ptr + (off_experts * stride_bze + \ + (offs_bn[None, :]//2) * stride_bzn + \ + offs_szk[:, None] * stride_bzk).to(tl.int32) + b_zp = tl.load(b_zp_ptrs, mask=masks_szk, other=k_other) + b_zp = ((b_zp >> 
b_zp_shifter) & 0xF) + b_zp = b_zp.to(tl.float32) + + + if num_groups == 1: + # Original efficient implementation for single group + b_scale = tl.broadcast_to(b_scale, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + b_zp = tl.broadcast_to(b_zp, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + else: + # Reshape to (num_groups, 1, N) then broadcast to (num_groups, group_size_in_block, N) + b_scale = tl.broadcast_to(b_scale[:, None, :], (num_groups, group_size, BLOCK_SIZE_N)) + b_scale = tl.reshape(b_scale, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + b_zp = tl.broadcast_to(b_zp[:, None, :], (num_groups, group_size, BLOCK_SIZE_N)) + b_zp = tl.reshape(b_zp, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + + # We accumulate along the K dimension. + if has_zp: + b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type) + else: + b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type) + + accumulator = tl.dot(a, b, acc=accumulator) + + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + if use_int4_w4a16: + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + else: + b_ptrs += BLOCK_SIZE_K * stride_bk + else: + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. 
+ + if not block_k_diviable: + k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K + k_other = 0.0 + else: + k_mask = None + k_other = None + + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs) + if use_int4_w4a16: + b = (b >> b_shifter) & 0xF + + b_scale_ptrs = b_scale_ptr + (off_experts * stride_bse + \ + offs_bn[None, :] * stride_bsn + \ + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * \ + stride_bsk).to(tl.int32) + b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other) + b_scale = b_scale.to(tl.float32) + + if has_zp and use_int4_w4a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + (off_experts * stride_bze + \ + (offs_bn[None, :] // 2) * stride_bzn + \ + offs_k_true * stride_bzk).to(tl.int32) + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = ((b_zp >> b_zp_shifter) & 0xF) + b_zp = b_zp.to(tl.float32) + elif has_zp and use_int8_w8a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + (off_experts * stride_bze + \ + offs_bn[None, :] * stride_bzn + \ + offs_k_true * stride_bzk).to(tl.int32) + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = b_zp.to(tl.float32) + + # We accumulate along the K dimension. + if has_zp: + b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type) + else: + b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type) + + accumulator = tl.dot(a, b, acc=accumulator) + + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak + if use_int4_w4a16: + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + else: + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + if USE_ADDR_OFFSET_INT64_C: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None].to(tl.int64) + stride_cn * offs_cn[ + None, :].to(tl.int64)) + else: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int32) + + if block_n_diviable: + c_mask = token_mask[:, None] + else: + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def _fused_moe_kernel_gptq_awq_w4a8_repr(specialization): + if distinguish_moe_kernel_name(): + constants = specialization.constants + mul_routed_weight = constants.get('MUL_ROUTED_WEIGHT', False) + return "fused_moe_kernel_gptq_awq_w4a8_bot" if mul_routed_weight else "fused_moe_kernel_gptq_awq_w4a8" + else: + return "fused_moe_kernel_gptq_awq_w4a8" + + +@triton.heuristics(values={ + 'block_k_diviable': lambda nargs: nargs['K'] % (nargs['BLOCK_SIZE_K']) == 0, + 'block_n_diviable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0, + "num_groups": lambda nargs: triton.cdiv(nargs["BLOCK_SIZE_K"], nargs["group_size"]), + 'group_size_divisible': lambda nargs: nargs['BLOCK_SIZE_K'] % nargs['group_size'] == 0, +}) +@triton.jit(repr=_fused_moe_kernel_gptq_awq_w4a8_repr) +def fused_moe_kernel_gptq_awq_w4a8( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + sorted_weights_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N: 
# Compile-time heuristics derived from the launch arguments:
#   block_k_diviable / block_n_diviable - K (resp. N) is a multiple of the tile size.
#   num_groups           - number of weight-quantization groups spanned by one K tile.
#   group_size_divisible - one K tile covers a whole number of weight groups.
@triton.heuristics(values={
    'block_k_diviable': lambda nargs: nargs['K'] % (nargs['BLOCK_SIZE_K']) == 0,
    'block_n_diviable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0,
    "num_groups": lambda nargs: triton.cdiv(nargs["BLOCK_SIZE_K"], nargs["group_size"]),
    'group_size_divisible': lambda nargs: nargs['BLOCK_SIZE_K'] % nargs['group_size'] == 0,
})
@triton.jit(repr=_fused_moe_kernel_gptq_awq_w4a8_repr)
def fused_moe_kernel_gptq_awq_w4a8(
        # Pointers to matrices
        a_ptr,
        b_ptr,
        c_ptr,
        a_scale_ptr,
        b_scale_ptr,
        b_zp_ptr,
        topk_weights_ptr,
        sorted_token_ids_ptr,
        sorted_weights_ptr,
        expert_ids_ptr,
        num_tokens_post_padded_ptr,
        # Matrix dimensions
        N: tl.constexpr,
        K: tl.constexpr,
        EM,
        num_valid_tokens,
        # The stride variables represent how much to increase the ptr by when
        # moving by 1 element in a particular dimension. E.g. `stride_am` is
        # how much to increase `a_ptr` by to get the element one row down
        # (A has M rows).
        stride_am,
        stride_ak,
        stride_be,
        stride_bk,
        stride_bn,
        stride_cm,
        stride_cn,
        stride_asm,
        stride_ask,
        stride_bse,
        stride_bsk,
        stride_bsn,
        stride_bze,
        stride_bzk,
        stride_bzn,
        group_k: tl.constexpr,  # a quant group size: ie. 128
        block_k_diviable: tl.constexpr,
        block_n_diviable: tl.constexpr,
        group_size: tl.constexpr,  # b w4 group size: ie. 64
        group_size_divisible: tl.constexpr,
        num_groups: tl.constexpr,
        # Meta-parameters
        BLOCK_SIZE_M: tl.constexpr,
        BLOCK_SIZE_N: tl.constexpr,
        BLOCK_SIZE_K: tl.constexpr,
        GROUP_SIZE_M: tl.constexpr,
        COMBINE_SCALE_LOAD: tl.constexpr,
        USE_MLS_LOAD: tl.constexpr,
        MUL_ROUTED_WEIGHT: tl.constexpr,
        USE_ADDR_OFFSET_INT64_A: tl.constexpr,
        USE_ADDR_OFFSET_INT64_C: tl.constexpr,
        top_k: tl.constexpr,
        compute_type: tl.constexpr,
        has_zp: tl.constexpr,
        use_int4_w4a16: tl.constexpr,
        use_int4_w4a8: tl.constexpr,
        use_int8_w8a16: tl.constexpr,
        ck_sorting: tl.constexpr,
        ck_topk: tl.constexpr,
        NUM_XCDS: tl.constexpr):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices, for group-quantized 4-bit weights with
    zero points (the "w4a8" path).

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension. The kernel addresses it as
        [N, BLOCK_SIZE_K // 2] tiles, i.e. two 4-bit values per stored element
        along K.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.

    Compile-time requirements (enforced by the static asserts below):
    `use_int4_w4a8` and `has_zp` are set, `group_k > 0`, K is a multiple of
    BLOCK_SIZE_K, `BLOCK_SIZE_K == group_size == group_k`, and
    `USE_MLS_LOAD` is off.

    Per K tile the kernel unpacks the 4-bit weights, subtracts the unpacked
    4-bit zero point, casts the result to int8, and accumulates
    `dot(a, b) * a_scale * b_scale` into an fp32 accumulator. With
    `MUL_ROUTED_WEIGHT` the accumulator is additionally multiplied by the
    routing weight read from `topk_weights_ptr`.

    `EM`, `sorted_weights_ptr`, `use_int4_w4a16` and `use_int8_w8a16` are not
    referenced in the body.
    """

    # Non-negativity hints for the compiler on every stride.
    tl.assume(stride_am >= 0)
    tl.assume(stride_ak >= 0)
    tl.assume(stride_be >= 0)
    tl.assume(stride_bk >= 0)
    tl.assume(stride_bn >= 0)
    tl.assume(stride_cm >= 0)
    tl.assume(stride_cn >= 0)
    tl.assume(stride_asm >= 0)
    tl.assume(stride_ask >= 0)
    tl.assume(stride_bse >= 0)
    tl.assume(stride_bsk >= 0)
    tl.assume(stride_bsn >= 0)
    tl.assume(stride_bze >= 0)
    tl.assume(stride_bzk >= 0)
    tl.assume(stride_bzn >= 0)

    # Tell the compiler that sorted_token_ids_ptr is a pointer into memory
    # and that all values stored there are non-negative.
    tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0)

    # This kernel only supports one configuration; reject everything else at
    # compile time.
    tl.static_assert(use_int4_w4a8 == True, "Must use int4_w4a8")
    tl.static_assert(has_zp == True, "only for deepseek w4a8 case.")
    tl.static_assert(group_k > 0, "group_k must be greater than 0 for deepseek w4a8 case.")
    tl.static_assert(block_k_diviable == True, "block_k_diviable must be True for deepseek w4a8 case.")
    tl.static_assert(BLOCK_SIZE_K <= group_k and group_k % BLOCK_SIZE_K == 0,
                     "BLOCK_SIZE_K must be divisible by GROUP_SIZE_K")
    tl.static_assert(group_size_divisible == True, "group_size_divisible must be True for deepseek w4a8 case.")
    tl.static_assert(BLOCK_SIZE_K == group_size, "BLOCK_SIZE_K must be equal to group_size for deepseek w4a8 case.")
    tl.static_assert(group_size == group_k, "group_size must be equal to group_k for deepseek w4a8 case.")
    tl.static_assert(USE_MLS_LOAD == False, "USE_MLS_LOAD must be False due to not supported yet.")

    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    GRID_MN = num_pid_n * num_pid_m
    if pid < GRID_MN:
        pid = remap_xcd(pid, GRID_MN, NUM_XCDS)
    else:
        return  # rest of the tiles are dummy paddings
    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M)

    offs_token_id = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)).to(tl.int32)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)

    if ck_sorting:
        # With ck_sorting each sorted id packs the token id in the low 24 bits
        # and the top-k slot in bits 24..31; rebuild the flat index from them.
        token_id = (offs_token & 0x00FFFFFF)
        topk_id = (offs_token >> 24) & 0xFF
        offs_token = token_id * ck_topk + topk_id

    token_mask = offs_token < num_valid_tokens

    off_experts = tl.load(expert_ids_ptr + pid_m)
    if off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        # c_ptr will be a zero inited buffer, no need to write zero explicitly
        # write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N,
        #                       offs_token, token_mask, BLOCK_SIZE_M,
        #                       BLOCK_SIZE_N, compute_type)
        return

    tl.assume(off_experts >= 0)

    # `% N` wraps out-of-range columns back into bounds so the B / scale /
    # zero-point loads need no N mask; the store at the end masks them instead.
    offs_bn = (pid_n * BLOCK_SIZE_N +
               tl.arange(0, BLOCK_SIZE_N)).to(tl.int32) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int32)


    # A rows are addressed by `offs_token // top_k`.
    if USE_ADDR_OFFSET_INT64_A:
        a_ptrs = a_ptr + (offs_token[:, None].to(tl.int64) // top_k * stride_am +
                          offs_k[None, :].to(tl.int64) * stride_ak)
    else:
        a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
                          offs_k[None, :] * stride_ak).to(tl.int32)

    # B is read as an [N, BLOCK_SIZE_K // 2] tile of packed elements.
    # NOTE(review): the whole B offset is truncated to int32 — confirm that
    # E * stride_be cannot exceed the int32 range for supported shapes.
    offs_k_continue = tl.arange(0, BLOCK_SIZE_K // 2).to(tl.int32)
    b_ptrs = b_ptr + (off_experts * stride_be + \
        offs_bn[:, None] * stride_bn + offs_k_continue[None, :] * \
        stride_bk).to(tl.int32) #[N, K//2]
    # Shift that selects the nibble for each k: 0 for even k, 4 for odd k.
    b_shifter = (offs_k[:, None] % 2) * 4 #[K]

    # Zero points are packed two per element along N: 0 for even n, 4 for odd n.
    b_zp_shifter = (offs_bn[None, :] % 2) * 4

    # The combined path loads two scale columns at once, so it needs a 2-D
    # ([M, 1]) base pointer; the plain path keeps a 1-D ([M]) one.
    if COMBINE_SCALE_LOAD:
        a_scale_ptrs = a_scale_ptr + (offs_token[:, None] // top_k) * stride_asm
    else:
        a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm

    # w4a8 deepseek case

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted back to fp16 after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    if COMBINE_SCALE_LOAD:
        # Two K tiles are processed per iteration so that the scales and zero
        # points of both tiles can be fetched with a single [*, 2] load.
        # NOTE(review): there is no tail guard for the second tile; this
        # presumably relies on cdiv(K, BLOCK_SIZE_K) being even — confirm at
        # the launch site.
        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2):
            # Load the next block of A and B, generate a mask by checking the
            # K dimension.
            # Group indices of the two tiles (BLOCK_SIZE_K == group_size).
            offs_szk = k + tl.arange(0, 2)

            a0 = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
            b0 = tl.load(b_ptrs)

            # Duplicate every packed element into two adjacent K positions,
            # transpose, then pick the low / high nibble per position.
            b0 = tl.interleave(b0, b0) #[N, K]
            b0 = tl.trans(b0) #[K, N]
            b0 = (b0 >> b_shifter) & 0xF
            b0 = b0.to(tl.int32)

            # Same indices as `offs_szk`, used for the activation scales.
            offs_ks = k + tl.arange(0, 2)
            a_scale = tl.load(a_scale_ptrs + offs_ks[None, :] * stride_ask,
                              mask=token_mask[:, None],
                              other=0.0) # [M, 2]
            a_scale0, a_scale1 = tl.split(a_scale) # [M]

            # b_scale shape: [N, K] = [N, 2]
            b_scale_ptrs = b_scale_ptr + (off_experts * stride_bse + \
                offs_bn[:, None] * stride_bsn + \
                offs_szk[None, :] * stride_bsk).to(tl.int32)
            b_scale = tl.load(b_scale_ptrs)
            b_scale = b_scale.to(tl.float32)
            b_scale0, b_scale1 = tl.split(b_scale) # [N]


            # Zero points: element index is n // 2, nibble chosen by n % 2.
            b_zp_ptrs = b_zp_ptr + (off_experts * stride_bze + \
                (offs_bn[:, None]//2) * stride_bzn + \
                offs_szk[None, :] * stride_bzk).to(tl.int32) #[N, 2]
            b_zp = tl.load(b_zp_ptrs)
            b_zp0, b_zp1 = tl.split(b_zp) # [N]

            b_zp0 = ((b_zp0 >> b_zp_shifter) & 0xF)
            b_zp1 = ((b_zp1 >> b_zp_shifter) & 0xF)
            b_zp0 = b_zp0.to(tl.int32)
            b_zp1 = b_zp1.to(tl.int32)

            # Second K tile of this iteration.
            a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, mask=token_mask[:, None], other=0.0)
            b1 = tl.load(b_ptrs + (BLOCK_SIZE_K // 2) * stride_bk)
            b1 = tl.interleave(b1, b1)
            b1 = tl.trans(b1)
            b1 = (b1 >> b_shifter) & 0xF
            b1 = b1.to(tl.int32)


            # Expand the per-column scales / zero points to full tile shapes.
            b_scale0 = tl.reshape(b_scale0, (1, BLOCK_SIZE_N))
            b_scale1 = tl.reshape(b_scale1, (1, BLOCK_SIZE_N))
            b_scale0 = tl.broadcast_to(b_scale0, (BLOCK_SIZE_M, BLOCK_SIZE_N))
            b_scale1 = tl.broadcast_to(b_scale1, (BLOCK_SIZE_M, BLOCK_SIZE_N))

            b_zp0 = tl.reshape(b_zp0, (1, BLOCK_SIZE_N))
            b_zp1 = tl.reshape(b_zp1, (1, BLOCK_SIZE_N))
            b_zp0 = tl.broadcast_to(b_zp0, (BLOCK_SIZE_K, BLOCK_SIZE_N))
            b_zp1 = tl.broadcast_to(b_zp1, (BLOCK_SIZE_K, BLOCK_SIZE_N))


            # We accumulate along the K dimension.
            # NOTE(review): `b` is cast to int8 before tl.dot, so `a` is
            # presumably int8 activations — confirm against the caller.
            b0 = (b0 - b_zp0).to(tl.int8)
            # accumulator += tl.dot(a0, b0) * a_scale0[:, None] * b_scale0[None, :]
            accumulator += tl.dot(a0, b0) * a_scale0[:, None] * b_scale0

            # We accumulate along the K dimension.
            b1 = (b1 - b_zp1).to(tl.int8)
            accumulator += tl.dot(a1, b1) * a_scale1[:, None] * b_scale1

            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak * 2
            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk * 2
    else:
        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
            # Load the next block of A and B, generate a mask by checking the
            # K dimension.
            tl.static_assert(num_groups == 1, "num_groups must be 1")
            # Weight-group index of this K tile (one group per tile).
            offs_szk = k * num_groups

            a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
            b = tl.load(b_ptrs)

            # Unpack the 4-bit weights into a [K, N] tile (see combined path).
            b = tl.interleave(b, b)
            b = tl.trans(b)
            b = (b >> b_shifter) & 0xF
            b = b.to(tl.int32)

            k_start = k * BLOCK_SIZE_K
            offs_ks = k_start // group_k
            a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
                              mask=token_mask,
                              other=0.0)

            b_scale_ptrs = b_scale_ptr + (off_experts * stride_bse + \
                offs_bn[None, :] * stride_bsn + \
                offs_szk * stride_bsk).to(tl.int32)
            b_scale = tl.load(b_scale_ptrs)
            b_scale = b_scale.to(tl.float32)

            # NOTE(review): `offs_szk` is a scalar here yet is indexed with
            # `[:, None]`, unlike the b_scale address above — confirm this is
            # intentional for the targeted Triton build.
            b_zp_ptrs = b_zp_ptr + (off_experts * stride_bze + \
                (offs_bn[None, :]//2) * stride_bzn + \
                offs_szk[:, None] * stride_bzk).to(tl.int32)
            b_zp = tl.load(b_zp_ptrs)
            b_zp = ((b_zp >> b_zp_shifter) & 0xF)
            b_zp = b_zp.to(tl.int32)


            b_scale = tl.broadcast_to(b_scale, (BLOCK_SIZE_M, BLOCK_SIZE_N))
            b_zp = tl.broadcast_to(b_zp, (BLOCK_SIZE_K, BLOCK_SIZE_N))

            # We accumulate along the K dimension.
            b = (b - b_zp).to(tl.int8)
            accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale

            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak
            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk

    if MUL_ROUTED_WEIGHT:
        moe_weight = tl.load(topk_weights_ptr + offs_token,
                             mask=token_mask,
                             other=0)
        accumulator = accumulator * moe_weight[:, None]

    accumulator = accumulator.to(compute_type)
    # -----------------------------------------------------------
    # Write back the block of the output
    # Unlike `offs_bn`, these column offsets are not wrapped with `% N`, so
    # they must be masked when N is not a multiple of BLOCK_SIZE_N.
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)

    if USE_ADDR_OFFSET_INT64_C:
        c_ptrs = c_ptr + (stride_cm * offs_token[:, None].to(tl.int64) + stride_cn * offs_cn[
            None, :].to(tl.int64))
    else:
        c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[
            None, :]).to(tl.int32)

    if block_n_diviable:
        c_mask = token_mask[:, None]
    else:
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
def _fused_moe_kernel_gptq_awq_w4a8_channelwise_repr(specialization):
    """Choose the reported name for the channel-wise w4a8 MoE kernel.

    This callable is handed to ``triton.jit`` through its ``repr=`` argument.
    The ``_bot`` suffix is used only when ``distinguish_moe_kernel_name()`` is
    truthy *and* the specialization's constants contain a truthy
    ``MUL_ROUTED_WEIGHT``; otherwise the plain kernel name is returned.
    """
    # Name splitting disabled: every specialization shares one name.
    if not distinguish_moe_kernel_name():
        return "fused_moe_kernel_gptq_awq_w4a8_channelwise"

    applies_routed_weight = specialization.constants.get('MUL_ROUTED_WEIGHT', False)
    if applies_routed_weight:
        return "fused_moe_kernel_gptq_awq_w4a8_channelwise_bot"
    return "fused_moe_kernel_gptq_awq_w4a8_channelwise"
# Compile-time flags telling the kernel whether K / N are multiples of the
# tile sizes, so masking can be skipped when they are.
@triton.heuristics(values={
    'block_k_diviable': lambda nargs: nargs['K'] % (nargs['BLOCK_SIZE_K']) == 0,
    'block_n_diviable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0,
})
@triton.jit(repr=_fused_moe_kernel_gptq_awq_w4a8_channelwise_repr)
def fused_moe_kernel_gptq_awq_w4a8_channelwise(
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    sorted_weights_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    N: tl.constexpr,
    K: tl.constexpr,
    EM,
    num_valid_tokens,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsn,
    block_k_diviable: tl.constexpr,
    block_n_diviable: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    COMBINE_SCALE_LOAD: tl.constexpr,
    USE_MLS_LOAD: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    USE_ADDR_OFFSET_INT64_A: tl.constexpr,
    USE_ADDR_OFFSET_INT64_B: tl.constexpr,
    USE_ADDR_OFFSET_INT64_C: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    use_int4_w4a8: tl.constexpr,
    ck_sorting: tl.constexpr,
    ck_topk: tl.constexpr,
    NUM_XCDS: tl.constexpr):
    """
    Fused MoE GEMM for packed 4-bit weights with one scale per output channel
    and one scale per token (no zero point, no K-grouping).

    Each program computes one [BLOCK_SIZE_M, BLOCK_SIZE_N] tile of C for the
    expert given by `expert_ids_ptr[pid_m]`:
    - token rows come from `sorted_token_ids_ptr` and A is addressed by
      `offs_token // top_k`;
    - B is read as [N, BLOCK_SIZE_K // 2] packed tiles, two 4-bit values per
      stored element along K; each nibble is sign-extended
      (values >= 8 become value - 16) and cast to int8;
    - every partial product is scaled by `a_scale[:, None] * b_scale`, where
      both scales are loaded once before the K loop;
    - with `MUL_ROUTED_WEIGHT` the result is multiplied by the routing weight
      from `topk_weights_ptr`;
    - the tile is cast to `compute_type` and stored to C at row `offs_token`.

    Compile-time requirements (static asserts): `use_int4_w4a8` is set,
    BLOCK_SIZE_K is even, and both COMBINE_SCALE_LOAD and USE_MLS_LOAD are
    off. K need not be a multiple of BLOCK_SIZE_K; the tail is masked.

    `EM` and `sorted_weights_ptr` are not referenced in the body, and
    `stride_ask` only appears in a `tl.assume`.
    """
    # Non-negativity hints for the compiler.
    tl.assume(stride_am >= 0)
    tl.assume(stride_ak >= 0)
    tl.assume(stride_be >= 0)
    tl.assume(stride_bk >= 0)
    tl.assume(stride_bn >= 0)
    tl.assume(stride_cm >= 0)
    tl.assume(stride_cn >= 0)
    tl.assume(stride_asm >= 0)
    tl.assume(stride_ask >= 0)
    tl.assume(stride_bse >= 0)
    tl.assume(stride_bsn >= 0)
    tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0)

    tl.static_assert(use_int4_w4a8 == True, "Must use int4_w4a8")
    tl.static_assert(BLOCK_SIZE_K % 2 == 0, "BLOCK_SIZE_K must be even for packed int4 weights.")
    tl.static_assert(COMBINE_SCALE_LOAD == False, "Channel-wise w4a8 does not use combined scale loads.")
    tl.static_assert(USE_MLS_LOAD == False, "Channel-wise w4a8 does not support MLS loads.")

    # Map the flat program id to an (M-tile, N-tile) pair.
    pid = tl.program_id(axis=0)
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    GRID_MN = num_pid_n * num_pid_m
    if pid < GRID_MN:
        pid = remap_xcd(pid, GRID_MN, NUM_XCDS)
    else:
        # Programs beyond the real grid have no tile to compute.
        return
    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M)

    offs_token_id = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)).to(tl.int32)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)

    if ck_sorting:
        # With ck_sorting each sorted id packs the token id in the low 24 bits
        # and the top-k slot in bits 24..31; rebuild the flat index from them.
        token_id = (offs_token & 0x00FFFFFF)
        topk_id = (offs_token >> 24) & 0xFF
        offs_token = token_id * ck_topk + topk_id

    token_mask = offs_token < num_valid_tokens

    off_experts = tl.load(expert_ids_ptr + pid_m)
    if off_experts == -1:
        # No expert for this tile: nothing is written to C.
        return

    # `% N` wraps out-of-range columns so B / b_scale loads need no N mask.
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)).to(tl.int32) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int32)
    # Offsets over the packed K axis (two 4-bit values per element).
    offs_k_continue = tl.arange(0, BLOCK_SIZE_K // 2).to(tl.int32)

    if USE_ADDR_OFFSET_INT64_A:
        a_ptrs = a_ptr + (
            offs_token[:, None].to(tl.int64) // top_k * stride_am
            + offs_k[None, :].to(tl.int64) * stride_ak
        )
    else:
        a_ptrs = a_ptr + (
            offs_token[:, None] // top_k * stride_am
            + offs_k[None, :] * stride_ak
        ).to(tl.int32)

    # B tile pointers, shape [BLOCK_SIZE_N, BLOCK_SIZE_K // 2].
    if USE_ADDR_OFFSET_INT64_B:
        b_ptrs = b_ptr + (
            off_experts.to(tl.int64) * stride_be
            + offs_bn[:, None].to(tl.int64) * stride_bn
            + offs_k_continue[None, :].to(tl.int64) * stride_bk
        )
    else:
        b_ptrs = b_ptr + (
            off_experts * stride_be
            + offs_bn[:, None] * stride_bn
            + offs_k_continue[None, :] * stride_bk
        ).to(tl.int32)
    # Shift that selects the nibble for each k: 0 for even k, 4 for odd k.
    b_shifter = (offs_k[:, None] % 2) * 4

    # Scales do not depend on k, so they are loaded once up front.
    # NOTE(review): the b_scale offset is always int32, even when
    # USE_ADDR_OFFSET_INT64_B is set — confirm that is sufficient.
    a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
    b_scale_ptrs = b_scale_ptr + (off_experts * stride_bse + offs_bn[None, :] * stride_bsn).to(tl.int32)
    b_scale = tl.load(b_scale_ptrs).to(tl.float32)

    a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0).to(tl.float32)
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        remaining_k = K - k * BLOCK_SIZE_K
        if not block_k_diviable:
            # Tail tile: zero A beyond K, and mask packed B elements beyond
            # ceil(remaining_k / 2).
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None] & (offs_k[None, :] < remaining_k),
                other=0.0,
            )
            b = tl.load(
                b_ptrs,
                mask=offs_k_continue[None, :] < tl.cdiv(remaining_k, 2),
                other=0,
            )
        else:
            a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
            b = tl.load(b_ptrs)

        # Duplicate every packed element into two adjacent K positions,
        # transpose to [K, N], then pick the low / high nibble per position.
        b = tl.interleave(b, b)
        b = tl.trans(b)
        b = ((b >> b_shifter) & 0xF).to(tl.int32)
        # Sign-extend the 4-bit value: 8..15 map to -8..-1.
        b = tl.where(b >= 8, b - 16, b).to(tl.int8)

        accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale

        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk

    if MUL_ROUTED_WEIGHT:
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator = accumulator * moe_weight[:, None]

    accumulator = accumulator.to(compute_type)
    # Output columns are not wrapped with `% N`; they are masked below instead.
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)

    if USE_ADDR_OFFSET_INT64_C:
        c_ptrs = c_ptr + (
            stride_cm * offs_token[:, None].to(tl.int64)
            + stride_cn * offs_cn[None, :].to(tl.int64)
        )
    else:
        c_ptrs = c_ptr + (
            stride_cm * offs_token[:, None]
            + stride_cn * offs_cn[None, :]
        ).to(tl.int32)

    if block_n_diviable:
        c_mask = token_mask[:, None]
    else:
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
def _fused_moe_kernel_repr(specialization):
    """Choose the reported name for ``fused_moe_kernel``.

    This callable is handed to ``triton.jit`` through its ``repr=`` argument.
    The ``_bot`` suffix is used only when ``distinguish_moe_kernel_name()`` is
    truthy *and* the specialization's constants contain a truthy
    ``MUL_ROUTED_WEIGHT``; otherwise the plain kernel name is returned.
    """
    # Name splitting disabled: every specialization shares one name.
    if not distinguish_moe_kernel_name():
        return "fused_moe_kernel"

    applies_routed_weight = specialization.constants.get('MUL_ROUTED_WEIGHT', False)
    if applies_routed_weight:
        return "fused_moe_kernel_bot"
    return "fused_moe_kernel"
# Compile-time flags telling the kernel whether K / N are multiples of the
# tile sizes, so masking can be skipped when they are.
@triton.heuristics(values={
    'block_k_diviable': lambda nargs: nargs['K'] % (nargs['BLOCK_SIZE_K']) == 0,
    'block_n_diviable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0,
})
@triton.jit(repr=_fused_moe_kernel_repr)
def fused_moe_kernel(
        # Pointers to matrices
        a_ptr,
        b_ptr,
        c_ptr,
        topk_weights_ptr,
        num_tokens_post_padded_ptr,
        expert_ids_ptr,
        sorted_token_ids_ptr,
        sorted_weights_ptr,
        a_scale_ptr,
        b_scale_ptr,
        # Matrix dimensions
        N,
        K,
        EM,
        num_valid_tokens,
        # The stride variables represent how much to increase the ptr by when
        # moving by 1 element in a particular dimension. E.g. `stride_am` is
        # how much to increase `a_ptr` by to get the element one row down
        # (A has M rows).
        stride_am,
        stride_ak,
        stride_be,
        stride_bk,
        stride_bn,
        stride_cm,
        stride_cn,
        stride_asm,
        stride_ask,
        stride_bse,
        stride_bsk,
        stride_bsn,
        total_tokens,
        # Block size for block-wise quantization
        group_n: tl.constexpr,
        group_k: tl.constexpr,
        block_k_diviable: tl.constexpr,
        block_n_diviable: tl.constexpr,
        # Meta-parameters
        BLOCK_SIZE_M: tl.constexpr,
        BLOCK_SIZE_N: tl.constexpr,
        BLOCK_SIZE_K: tl.constexpr,
        GROUP_SIZE_M: tl.constexpr,
        COMBINE_SCALE_LOAD: tl.constexpr,
        USE_MLS_LOAD: tl.constexpr,
        MUL_ROUTED_WEIGHT: tl.constexpr,
        USE_ADDR_OFFSET_INT64_A: tl.constexpr,
        USE_ADDR_OFFSET_INT64_C: tl.constexpr,
        top_k: tl.constexpr,
        compute_type: tl.constexpr,
        use_fp8_w8a8: tl.constexpr,
        use_int8_w8a8: tl.constexpr,
        use_int8_w8a16: tl.constexpr,
        per_channel_quant: tl.constexpr,
        c_sorted: tl.constexpr,
        bottom_a_use_mls_load: tl.constexpr,
        ck_sorting: tl.constexpr,
        ck_topk: tl.constexpr,
        NUM_XCDS: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.

    Quantization modes handled in the body:
    - `use_int8_w8a16`: a per-output-channel `b_scale` is loaded once and
      applied to the accumulator after the K loop.
    - `use_fp8_w8a8` / `use_int8_w8a8`:
        * block-wise when `group_k > 0 and group_n > 0`: scales are loaded per
          K tile and applied to each partial product;
        * channel-wise when `per_channel_quant`: per-token `a_scale` and
          per-channel `b_scale` are loaded once and applied after the loop;
        * otherwise tensor-wise: a single `a_scale` and one `b_scale` per
          expert, applied after the loop.
    - none of the above: plain `tl.dot` accumulation.

    Other compile-time switches:
    - `COMBINE_SCALE_LOAD`: process two K tiles per iteration and fetch both
      scale columns with one load (block-wise w8a8 only, see static asserts).
    - `USE_MLS_LOAD` / `bottom_a_use_mls_load`: fetch B / A tiles through
      `tl.matrix_load` instead of pointer-based `tl.load`.
    - `c_sorted`: store output rows at the sorted position (`offs_token_id`)
      instead of at `offs_token`.
    - `ck_sorting`: decode packed sorted ids and read routing weights from
      `sorted_weights_ptr`.

    `EM` is not referenced in the body.
    """

    # Non-negativity hints for the compiler.
    tl.assume(stride_am >= 0)
    tl.assume(stride_ak >= 0)
    tl.assume(stride_be >= 0)
    tl.assume(stride_bk >= 0)
    tl.assume(stride_bn >= 0)
    tl.assume(stride_cm >= 0)
    tl.assume(stride_cn >= 0)
    tl.assume(stride_bse >= 0)
    tl.assume(stride_bsk >= 0)
    tl.assume(stride_bsn >= 0)

    # Tell the compiler that sorted_token_ids_ptr is a pointer into memory
    # and that all values stored there are non-negative.
    tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0)

    # Compile-time validation of the supported flag combinations.
    if group_k > 0:
        tl.static_assert(BLOCK_SIZE_K <= group_k and group_k % BLOCK_SIZE_K == 0,
                         "BLOCK_SIZE_K must be divisible by GROUP_SIZE_K")
    if COMBINE_SCALE_LOAD:  # used for use_int8_w8a8
        tl.static_assert(stride_ask == 1,
                         "COMBINE_SCALE_LOAD implictly stride_ask == 1!")
        tl.static_assert(MUL_ROUTED_WEIGHT == False,
                         "COMBINE_SCALE_LOAD and MUL_ROUTED_WEIGHT cannot be both true due to w1_scale and w2_scale diff layout!")
        tl.static_assert(block_k_diviable == True and BLOCK_SIZE_K == group_k,
                         "COMBINE_SCALE_LOAD only add and verify on block_k_diviable!")
        tl.static_assert(use_int8_w8a8 or use_fp8_w8a8,
                         "COMBINE_SCALE_LOAD only add and verify on use_int8_w8a8 or use_fp8_w8a8!")
    if USE_MLS_LOAD:
        tl.static_assert(block_k_diviable == True and block_n_diviable == True,
                         "USE_MLS_LOAD must require block_k_diviable and block_n_diviable(maybe exceed 2M Page)!")
    if bottom_a_use_mls_load:
        tl.static_assert(MUL_ROUTED_WEIGHT == True and c_sorted == False,
                         "bottom_a_use_mls_load true must when MUL_ROUTED_WEIGHT == True and c_sorted == False!")


    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    GRID_MN = num_pid_n * num_pid_m
    if pid < GRID_MN:
        pid = remap_xcd(pid, GRID_MN, NUM_XCDS)
    else:
        return  # rest of the tiles are dummy paddings
    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # NOTE(review): the expert index is narrowed to int32 and then multiplied
    # by stride_be — confirm E * stride_be cannot overflow for supported shapes.
    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int32)
    if off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        # c_ptr will be a zero inited buffer, no need to write zero explicitly
        # write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N,
        #                       offs_token, token_mask, BLOCK_SIZE_M,
        #                       BLOCK_SIZE_N, compute_type)
        return

    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(
        tl.int32)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)

    if ck_sorting:
        # With ck_sorting each sorted id packs the token id in the low 24 bits
        # and the top-k slot in bits 24..31; rebuild the flat index from them.
        token_id = (offs_token & 0x00FFFFFF)
        topk_id = (offs_token >> 24) & 0xFF
        offs_token = token_id * ck_topk + topk_id

    token_mask = offs_token < num_valid_tokens

    # `% N` wraps out-of-range columns so unmasked B / scale loads stay in bounds.
    offs_bn = (pid_n * BLOCK_SIZE_N +
               tl.arange(0, BLOCK_SIZE_N).to(tl.int32)) % N

    offs_k = tl.arange(0, BLOCK_SIZE_K)
    if USE_ADDR_OFFSET_INT64_A:
        a_ptrs = a_ptr + (offs_token[:, None].to(tl.int64) // top_k * stride_am +
                          offs_k[None, :].to(tl.int64) * stride_ak)
    else:
        a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
                          offs_k[None, :] * stride_ak).to(tl.int32)

    # B tile pointers, shape [BLOCK_SIZE_K, BLOCK_SIZE_N].
    b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
                                                offs_bn[None, :] * stride_bn).to(tl.int32)

    if use_int8_w8a16:
        b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
            None, :] * stride_bsn
        b_scale = tl.load(b_scale_ptrs)

    if use_fp8_w8a8 or use_int8_w8a8:
        # block-wise
        if group_k > 0 and group_n > 0:
            if COMBINE_SCALE_LOAD:
                # 2-D base pointers so two scale columns can be loaded at once.
                a_scale_ptrs = a_scale_ptr + (offs_token[:, None] // top_k) * stride_asm
                offs_bsn = offs_bn // group_n
                b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse +
                                offs_bsn[:, None] * stride_bsn)
            else:
                if bottom_a_use_mls_load:  # top_k is 1
                    a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm
                else:
                    a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm

                offs_bsn = offs_bn // group_n
                b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse +
                                offs_bsn * stride_bsn)
        # channel-wise
        elif per_channel_quant:
            b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
                None, :] * stride_bsn
            b_scale = tl.load(b_scale_ptrs)
            # Load per-token scale for activations
            if bottom_a_use_mls_load:  # top_k is 1 and A is laid out in sorted MLS order
                a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm
            else:
                a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
            a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:,
                                                                        None]
        # tensor-wise
        else:
            a_scale = tl.load(a_scale_ptr)
            b_scale = tl.load(b_scale_ptr + off_experts)

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted back to fp16 after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    # Running K offset handed to tl.matrix_load (which takes tile offsets
    # rather than pre-advanced pointers).
    mls_offs_k = 0
    if COMBINE_SCALE_LOAD:
        # Two K tiles per iteration. The static asserts above force
        # block_k_diviable here, so the `not block_k_diviable` branches below
        # are compiled out.
        # NOTE(review): there is no tail guard for the second tile; this
        # presumably relies on cdiv(K, BLOCK_SIZE_K) being even — confirm at
        # the launch site.
        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2):
            # Load the next block of A and B, generate a mask by checking the
            # K dimension.
            if not block_k_diviable:
                a0 = tl.load(a_ptrs,
                             mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                             other=0.0)
                b0 = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
            else:
                a0 = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
                if not USE_MLS_LOAD:
                    b0 = tl.load(b_ptrs)
                else:
                    # NOTE(review): tl.matrix_load is not defined in this chunk;
                    # presumably a backend-specific tile load — confirm semantics.
                    b0 = tl.matrix_load(
                        b_ptr + off_experts * stride_be,
                        shape=[K, N],
                        strides=[stride_bk, stride_bn],
                        block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N],
                        offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N])

            # We accumulate along the K dimension.
            if use_int8_w8a16:
                # Unsupported with COMBINE_SCALE_LOAD: fails at compile time.
                accumulator = tl.dot(a0, b0.to(compute_type), acc=accumulator)
                tl.static_assert(False, "Not implemented")
            elif use_fp8_w8a8 or use_int8_w8a8:
                if group_k > 0 and group_n > 0:
                    k_start = k * BLOCK_SIZE_K
                    # Scale-group indices of the two tiles handled this iteration.
                    offs_ks = k_start // group_k + tl.arange(0, 2)
                    a_scale = tl.load(a_scale_ptrs + offs_ks[None, :] * stride_ask,
                                      mask=token_mask[:, None],
                                      other=0.0)
                    b_scale = tl.load(b_scale_ptrs + offs_ks[None, :] * stride_bsk)
                    a_scale_0, a_scale_1 = tl.split(a_scale)
                    b_scale_0, b_scale_1 = tl.split(b_scale)

                    if BLOCK_SIZE_N > group_n:
                        accumulator += tl.dot(a0, b0) * a_scale_0[:, None] * b_scale_0[None, :]
                    else:
                        accumulator += tl.dot(a0, b0) * (a_scale_0[:, None] * b_scale_0)

                    # Second K tile of this iteration.
                    if not block_k_diviable:
                        a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak,
                                     mask=token_mask[:, None] & (offs_k[None, :] < K - (k + 1) * BLOCK_SIZE_K),
                                     other=0.0)
                        b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk,
                                     mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0)
                    else:
                        a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, mask=token_mask[:, None], other=0.0)
                        if not USE_MLS_LOAD:
                            b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk)
                        else:
                            b1 = tl.matrix_load(
                                b_ptr + off_experts * stride_be,
                                shape=[K, N],
                                strides=[stride_bk, stride_bn],
                                block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N],
                                offsets=[mls_offs_k + BLOCK_SIZE_K, (pid_n * BLOCK_SIZE_N) % N])

                    if BLOCK_SIZE_N > group_n:
                        accumulator += tl.dot(a1, b1) * a_scale_1[:, None] * b_scale_1[None, :]
                    else:
                        accumulator += tl.dot(a1, b1) * (a_scale_1[:, None] * b_scale_1)
                else:
                    # Non-block-wise scaling is unsupported with
                    # COMBINE_SCALE_LOAD (`a` / `b` are not defined on this path).
                    if use_fp8_w8a8:
                        # acc used to enable fp8_fast_accum
                        accumulator = tl.dot(a, b, acc=accumulator)
                    else:
                        accumulator += tl.dot(a, b)
                    tl.static_assert(False, "Not implemented")
            else:
                # Unquantized inputs are unsupported with COMBINE_SCALE_LOAD.
                accumulator += tl.dot(a, b)
                tl.static_assert(False, "Not implemented")

            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak * 2
            b_ptrs += BLOCK_SIZE_K * stride_bk * 2
            mls_offs_k += BLOCK_SIZE_K * 2

    else:  # non-COMBINE_SCALE_LOAD

        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):

            # Load the next block of A and B, generate a mask by checking the
            # K dimension.
            if not block_k_diviable:
                a = tl.load(a_ptrs,
                            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                            other=0.0)
                b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
            else:
                if not bottom_a_use_mls_load:
                    a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
                else:
                    # A is read by sorted row position (pid_m * BLOCK_SIZE_M),
                    # not through `offs_token`.
                    a = tl.matrix_load(
                        a_ptr,
                        shape=[total_tokens, K],
                        strides=[stride_am, stride_ak],
                        block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_K],
                        offsets=[pid_m * BLOCK_SIZE_M, mls_offs_k])
                    # mask_a_mls = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) < total_tokens
                    # a = tl.where(mask_a_mls[:, None], a, 0)

                if not USE_MLS_LOAD:
                    b = tl.load(b_ptrs)
                else:
                    b = tl.matrix_load(
                        b_ptr + off_experts * stride_be,
                        shape=[K, N],
                        strides=[stride_bk, stride_bn],
                        block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N],
                        offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N])

            # We accumulate along the K dimension.
            if use_int8_w8a16:
                accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
            elif use_fp8_w8a8 or use_int8_w8a8:
                if group_k > 0 and group_n > 0:
                    # Block-wise: scale each partial product with the scales
                    # of the group this K tile belongs to.
                    k_start = k * BLOCK_SIZE_K
                    offs_ks = k_start // group_k
                    a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
                                      mask=token_mask,
                                      other=0.0)
                    b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)

                    if BLOCK_SIZE_N > group_n:
                        accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
                    else:
                        accumulator += tl.dot(a, b) * (a_scale[:, None] * b_scale)
                else:
                    # Channel-/tensor-wise: scales are applied after the loop.
                    if use_fp8_w8a8:
                        # acc used to enable fp8_fast_accum
                        accumulator = tl.dot(a, b, acc=accumulator)
                    else:
                        accumulator += tl.dot(a, b)
            else:
                accumulator += tl.dot(a, b)
            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak
            b_ptrs += BLOCK_SIZE_K * stride_bk
            mls_offs_k += BLOCK_SIZE_K

    if MUL_ROUTED_WEIGHT:
        # Both of them can work well.
        if ck_sorting:
            # Weights stored in sorted order are indexed by sorted position.
            moe_weight = tl.load(sorted_weights_ptr + offs_token_id,
                                 mask=token_mask,
                                 other=0)
        else:
            moe_weight = tl.load(topk_weights_ptr + offs_token,
                                 mask=token_mask,
                                 other=0)
        accumulator = accumulator * moe_weight[:, None]

    # Apply the scales that were not already folded in inside the K loop.
    if use_int8_w8a16:
        accumulator = (accumulator * b_scale).to(compute_type)
    elif use_fp8_w8a8 or use_int8_w8a8:
        if group_k > 0 and group_n > 0:
            accumulator = accumulator.to(compute_type)
        else:
            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
    else:
        accumulator = accumulator.to(compute_type)
    # -----------------------------------------------------------
    # Write back the block of the output
    # Output columns are not wrapped with `% N`; they are masked below instead.
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    if USE_ADDR_OFFSET_INT64_C:
        if c_sorted:
            c_ptrs = c_ptr + (stride_cm * offs_token_id[:, None].to(tl.int64) + stride_cn * offs_cn[
                None, :].to(tl.int64))
        else:
            c_ptrs = c_ptr + (stride_cm * offs_token[:, None].to(tl.int64) + stride_cn * offs_cn[
                None, :].to(tl.int64))
    else:
        if c_sorted:
            c_ptrs = c_ptr + (stride_cm * offs_token_id[:, None] + stride_cn * offs_cn[
                None, :]).to(tl.int32)
        else:
            c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[
                None, :]).to(tl.int32)
    if not block_n_diviable:
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    else:
        c_mask = token_mask[:, None]

    tl.store(c_ptrs, accumulator, mask=c_mask)
def _fused_moe_splitk_kernel_repr(specialization):
    """Choose the reported name for ``fused_moe_splitk_kernel``.

    This callable is handed to ``triton.jit`` through its ``repr=`` argument.
    The ``_bot`` suffix is used only when ``distinguish_moe_kernel_name()`` is
    truthy *and* the specialization's constants contain a truthy
    ``MUL_ROUTED_WEIGHT``; otherwise the plain kernel name is returned.
    """
    # Name splitting disabled: every specialization shares one name.
    if not distinguish_moe_kernel_name():
        return "fused_moe_splitk_kernel"

    applies_routed_weight = specialization.constants.get('MUL_ROUTED_WEIGHT', False)
    if applies_routed_weight:
        return "fused_moe_splitk_kernel_bot"
    return "fused_moe_splitk_kernel"
+ stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_ck, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + total_tokens, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + block_k_diviable: tl.constexpr, + block_n_diviable: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + COMBINE_SCALE_LOAD: tl.constexpr, + USE_MLS_LOAD: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + SPLIT_K: tl.constexpr, + k_per_split: tl.constexpr, + USE_ADDR_OFFSET_INT64_A: tl.constexpr, + USE_ADDR_OFFSET_INT64_C: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_channel_quant: tl.constexpr, + c_sorted: tl.constexpr, + bottom_a_use_mls_load: tl.constexpr, + ck_sorting: tl.constexpr, + ck_topk: tl.constexpr, + NUM_XCDS: tl.constexpr, +): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. 
It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + + tl.assume(stride_am >= 0) + tl.assume(stride_ak >= 0) + tl.assume(stride_be >= 0) + tl.assume(stride_bk >= 0) + tl.assume(stride_bn >= 0) + tl.assume(stride_ck >= 0) + tl.assume(stride_cm >= 0) + tl.assume(stride_cn >= 0) + tl.assume(stride_bse >= 0) + tl.assume(stride_bsk >= 0) + tl.assume(stride_bsn >= 0) + + # to notify the compiler that sorted_token_ids_ptr is a pointer to the memory, + # and all value in the memory is non-negative. + tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0) + + if group_k > 0: + tl.static_assert(BLOCK_SIZE_K <= group_k and group_k % BLOCK_SIZE_K == 0, + "BLOCK_SIZE_K must be divisible by GROUP_SIZE_K") + if COMBINE_SCALE_LOAD: # used for use_int8_w8a8 + tl.static_assert(stride_ask == 1, + "COMBINE_SCALE_LOAD implictly stride_ask == 1!") + tl.static_assert(MUL_ROUTED_WEIGHT == False, + "COMBINE_SCALE_LOAD and MUL_ROUTED_WEIGHT cannot be both true due to w1_scale and w2_scale diff layout!") + tl.static_assert(block_k_diviable == True and BLOCK_SIZE_K == group_k, + "COMBINE_SCALE_LOAD only add and verify on block_k_diviable!") + tl.static_assert(use_int8_w8a8 or use_fp8_w8a8, + "COMBINE_SCALE_LOAD only add and verify on use_int8_w8a8 or use_fp8_w8a8!") + if USE_MLS_LOAD: + tl.static_assert(block_k_diviable == True and block_n_diviable == True, + "USE_MLS_LOAD must require block_k_diviable and block_n_diviable(maybe exceed 2M Page)!") + if bottom_a_use_mls_load: + tl.static_assert(MUL_ROUTED_WEIGHT == True and c_sorted == False, + "bottom_a_use_mls_load true must when MUL_ROUTED_WEIGHT == True and 
c_sorted == False!") + + if SPLIT_K != 1: + tl.static_assert((use_int8_w8a8 == False and use_int8_w8a16 == False) and use_fp8_w8a8 == False, + "SPLIT_K only add and verify on use_int8_w8a8 == False and use_fp8_w8a8 == False and use_int8_w8a16 == False!") + tl.static_assert(MUL_ROUTED_WEIGHT == False, + "SPLIT_K can only work on gemm1 case!") + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + splitk_idx = 0 if SPLIT_K == 1 else tl.program_id(axis=1) + + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + + num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + GRID_MN = num_pid_n * num_pid_m + if pid < GRID_MN: + pid = remap_xcd(pid, GRID_MN, NUM_XCDS) + else: + return # rest of the tiles are dummy paddings + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M) + + + k_start = splitk_idx * k_per_split + k_end = k_start + k_per_split + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int32) + if off_experts == -1: + # ----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. 
+ # c_ptr will be a zero inited buffer, no need to write zero explicitly + # write_zeros_to_output(c_ptr, pid_k, stride_ck, stride_cm, stride_cn, pid_n, N, + # offs_token, token_mask, BLOCK_SIZE_M, + # BLOCK_SIZE_N, compute_type) + return + + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int32) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + + if ck_sorting: + token_id = (offs_token & 0x00FFFFFF) + topk_id = (offs_token >> 24) & 0xFF + offs_token = token_id * ck_topk + topk_id + + token_mask = offs_token < num_valid_tokens + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)) % N + + offs_k = tl.arange(0, BLOCK_SIZE_K) + if USE_ADDR_OFFSET_INT64_A: + a_ptrs = a_ptr + (offs_token[:, None].to(tl.int64) // top_k * stride_am + + (offs_k[None, :] + k_start).to(tl.int64) * stride_ak) + else: + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + (offs_k[None, :] + k_start) * stride_ak).to(tl.int32) + + b_ptrs = b_ptr + off_experts * stride_be + ((offs_k[:, None] + k_start) * stride_bk + + offs_bn[None, :] * stride_bn).to(tl.int32) + + if use_int8_w8a16: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + + if use_fp8_w8a8 or use_int8_w8a8: + # block-wise + if group_k > 0 and group_n > 0: + if COMBINE_SCALE_LOAD: + a_scale_ptrs = a_scale_ptr + (offs_token[:, None] // top_k) * stride_asm + offs_bsn = offs_bn // group_n + b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse + + offs_bsn[:, None] * stride_bsn) + else: + if bottom_a_use_mls_load: # top_k is 1 + a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm + else: + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + + offs_bsn = offs_bn // group_n + b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse + + offs_bsn * stride_bsn) + # channel-wise + elif per_channel_quant: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, 
:] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + # Load per-token scale for activations + if bottom_a_use_mls_load: # top_k is 1 and A is laid out in sorted MLS order + a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm + else: + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, + None] + # tensor-wise + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr + off_experts) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + mls_offs_k = k_start + if COMBINE_SCALE_LOAD: + tl.static_assert(SPLIT_K == 1, "COMBINE_SCALE_LOAD only add and verify on SPLIT_K == 1!") + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + if not block_k_diviable: + a0 = tl.load(a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b0 = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + else: + a0 = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0) + if not USE_MLS_LOAD: + b0 = tl.load(b_ptrs) + else: + b0 = tl.matrix_load( + b_ptr + off_experts * stride_be, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N]) + + # We accumulate along the K dimension. 
+ if use_int8_w8a16: + accumulator = tl.dot(a0, b0.to(compute_type), acc=accumulator) + tl.static_assert(False, "Not implemented") + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + tl.arange(0, 2) + a_scale = tl.load(a_scale_ptrs + offs_ks[None, :] * stride_ask, + mask=token_mask[:, None], + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks[None, :] * stride_bsk) + a_scale_0, a_scale_1 = tl.split(a_scale) + b_scale_0, b_scale_1 = tl.split(b_scale) + + if BLOCK_SIZE_N > group_n: + accumulator += tl.dot(a0, b0) * a_scale_0[:, None] * b_scale_0[None, :] + else: + accumulator += tl.dot(a0, b0) * (a_scale_0[:, None] * b_scale_0) + + if not block_k_diviable: + a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, + mask=token_mask[:, None] & (offs_k[None, :] < K - (k + 1) * BLOCK_SIZE_K), + other=0.0) + b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk, + mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0) + else: + a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, mask=token_mask[:, None], other=0.0) + if not USE_MLS_LOAD: + b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk) + else: + b1 = tl.matrix_load( + b_ptr + off_experts * stride_be, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k + BLOCK_SIZE_K, (pid_n * BLOCK_SIZE_N) % N]) + + if BLOCK_SIZE_N > group_n: + accumulator += tl.dot(a1, b1) * a_scale_1[:, None] * b_scale_1[None, :] + else: + accumulator += tl.dot(a1, b1) * (a_scale_1[:, None] * b_scale_1) + else: + if use_fp8_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + tl.static_assert(False, "Not implemented") + else: + accumulator += tl.dot(a, b) + tl.static_assert(False, "Not implemented") + + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak * 2 + b_ptrs += BLOCK_SIZE_K * stride_bk * 2 + mls_offs_k += BLOCK_SIZE_K * 2 + + else: # non-COMBINE_SCALE_LOAD + + for k in range(0, tl.cdiv(k_per_split, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + if not block_k_diviable: + a = tl.load(a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < (k_per_split) - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < (k_per_split) - k * BLOCK_SIZE_K, other=0.0) + else: + if not bottom_a_use_mls_load: + a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0) + else: + a = tl.matrix_load( + a_ptr, + shape=[total_tokens, K], + strides=[stride_am, stride_ak], + block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_K], + offsets=[pid_m * BLOCK_SIZE_M, mls_offs_k]) + + if not USE_MLS_LOAD: + b = tl.load(b_ptrs) + else: + b = tl.matrix_load( + b_ptr + off_experts * stride_be, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N]) + + # We accumulate along the K dimension. + if use_int8_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, + mask=token_mask, + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + if BLOCK_SIZE_N > group_n: + accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :] + else: + accumulator += tl.dot(a, b) * (a_scale[:, None] * b_scale) + else: + if use_fp8_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + else: + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + mls_offs_k += BLOCK_SIZE_K + + if MUL_ROUTED_WEIGHT: + # Both of them can work well. + if ck_sorting: + moe_weight = tl.load(sorted_weights_ptr + offs_token_id, + mask=token_mask, + other=0) + else: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + if use_int8_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + if USE_ADDR_OFFSET_INT64_C: + if c_sorted: + c_ptrs = c_ptr + (stride_ck * splitk_idx.to(tl.int64) + stride_cm * offs_token_id[:, None].to(tl.int64) + stride_cn * offs_cn[ + None, :].to(tl.int64)) + else: + c_ptrs = c_ptr + (stride_ck * splitk_idx.to(tl.int64) + stride_cm * offs_token[:, None].to(tl.int64) + stride_cn * offs_cn[ + None, :].to(tl.int64)) + else: + if c_sorted: + c_ptrs = c_ptr + (stride_ck * splitk_idx + stride_cm * offs_token_id[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int32) + else: + c_ptrs = c_ptr + (stride_ck * splitk_idx + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int32) + if not block_n_diviable: + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + else: + c_mask = token_mask[:, None] + + tl.store(c_ptrs, accumulator, mask=c_mask) + + + +def _fused_moe_persistent_kernel_repr(specialization): + if distinguish_moe_kernel_name(): + constants = specialization.constants + mul_routed_weight = constants.get('MUL_ROUTED_WEIGHT', False) + return "fused_moe_persistent_kernel_bot" if mul_routed_weight else 
"fused_moe_persistent_kernel" + else: + return "fused_moe_persistent_kernel" + + +@triton.heuristics(values={ + 'block_k_diviable': lambda nargs: nargs['K'] % (nargs['BLOCK_SIZE_K']) == 0, + 'block_n_diviable': lambda nargs: nargs['N'] % nargs['BLOCK_SIZE_N'] == 0, +}) +@triton.jit(repr=_fused_moe_persistent_kernel_repr) +def fused_moe_persistent_kernel( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + topk_weights_ptr, + num_tokens_post_padded_ptr, + expert_ids_ptr, + sorted_token_ids_ptr, + sorted_weights_ptr, + a_scale_ptr, + b_scale_ptr, + # Matrix dimensions + N, + K, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + total_tokens, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + block_k_diviable: tl.constexpr, + block_n_diviable: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + COMBINE_SCALE_LOAD: tl.constexpr, + USE_MLS_LOAD: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + USE_ADDR_OFFSET_INT64_A: tl.constexpr, + USE_ADDR_OFFSET_INT64_C: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_channel_quant: tl.constexpr, + c_sorted: tl.constexpr, + bottom_a_use_mls_load: tl.constexpr, + ck_sorting: tl.constexpr, + ck_topk: tl.constexpr, + NUM_SMS: tl.constexpr, + NUM_XCDS: tl.constexpr, +): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. 
+ This is the persistent version of the fused_moe kernel. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + + tl.assume(stride_am >= 0) + tl.assume(stride_ak >= 0) + tl.assume(stride_be >= 0) + tl.assume(stride_bk >= 0) + tl.assume(stride_bn >= 0) + tl.assume(stride_cm >= 0) + tl.assume(stride_cn >= 0) + tl.assume(stride_bse >= 0) + tl.assume(stride_bsk >= 0) + tl.assume(stride_bsn >= 0) + + # to notify the compiler that sorted_token_ids_ptr is a pointer to the memory, + # and all value in the memory is non-negative. 
+ tl.assume(sorted_token_ids_ptr.to(tl.int64) >= 0) + + if group_k > 0: + tl.static_assert(BLOCK_SIZE_K <= group_k and group_k % BLOCK_SIZE_K == 0, + "BLOCK_SIZE_K must be divisible by GROUP_SIZE_K") + if COMBINE_SCALE_LOAD: # used for use_int8_w8a8 + tl.static_assert(stride_ask == 1, + "COMBINE_SCALE_LOAD implictly stride_ask == 1!") + tl.static_assert(MUL_ROUTED_WEIGHT == False, + "COMBINE_SCALE_LOAD and MUL_ROUTED_WEIGHT cannot be both true due to w1_scale and w2_scale diff layout!") + tl.static_assert(block_k_diviable == True and BLOCK_SIZE_K == group_k, + "COMBINE_SCALE_LOAD only add and verify on block_k_diviable!") + tl.static_assert(use_int8_w8a8 or use_fp8_w8a8, + "COMBINE_SCALE_LOAD only add and verify on use_int8_w8a8 or use_fp8_w8a8!") + if USE_MLS_LOAD: + tl.static_assert(block_k_diviable == True and block_n_diviable == True, + "USE_MLS_LOAD must require block_k_diviable and block_n_diviable(maybe exceed 2M Page)!") + if bottom_a_use_mls_load: + tl.static_assert(MUL_ROUTED_WEIGHT == True and c_sorted == False, + "bottom_a_use_mls_load true must when MUL_ROUTED_WEIGHT == True and c_sorted == False!") + + + # ----------------------------------------------------------- + # Simply compute how many iterations each persistent block needs to do + start_pid = tl.program_id(axis=0) + + # Load tile-invariant runtime constant + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + + num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + tile_id = start_pid + + num_tiles = num_pid_m * num_pid_n + + # Compute how many tiles are outside the padding region + num_valid_tiles = tl.cdiv((num_tiles - tile_id), NUM_SMS) + + for _ in range(0, num_valid_tiles): + tile_id_remapped = remap_xcd(tile_id, num_tiles, NUM_XCDS) + pid_m, pid_n = pid_grid(tile_id_remapped, num_pid_m, num_pid_n, GROUP_SIZE_M) + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int32) + if off_experts == -1: + # 
----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. + # c_ptr will be a zero inited buffer, no need to write zero explicitly + # write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, + # offs_token, token_mask, BLOCK_SIZE_M, + # BLOCK_SIZE_N, compute_type) + pass + else: + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int32) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + + if ck_sorting: + token_id = (offs_token & 0x00FFFFFF) + topk_id = (offs_token >> 24) & 0xFF + offs_token = token_id * ck_topk + topk_id + + token_mask = offs_token < num_valid_tokens + + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)) % N + + offs_k = tl.arange(0, BLOCK_SIZE_K) + if USE_ADDR_OFFSET_INT64_A: + a_ptrs = a_ptr + (offs_token[:, None].to(tl.int64) // top_k * stride_am + + offs_k[None, :].to(tl.int64) * stride_ak) + else: + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak).to(tl.int32) + + b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn).to(tl.int32) + + if use_int8_w8a16: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + + if use_fp8_w8a8 or use_int8_w8a8: + # block-wise + if group_k > 0 and group_n > 0: + if COMBINE_SCALE_LOAD: + a_scale_ptrs = a_scale_ptr + (offs_token[:, None] // top_k) * stride_asm + offs_bsn = offs_bn // group_n + b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse + + offs_bsn[:, None] * stride_bsn) + else: + if bottom_a_use_mls_load: # top_k is 1 + a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm + else: + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + + offs_bsn = offs_bn // group_n + b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse + + offs_bsn * stride_bsn) + # 
channel-wise + elif per_channel_quant: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + # Load per-token scale for activations + if bottom_a_use_mls_load: # top_k is 1 and A is laid out in sorted MLS order + a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm + else: + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, + None] + # tensor-wise + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr + off_experts) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + mls_offs_k = 0 + if COMBINE_SCALE_LOAD: + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K), 2): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + if not block_k_diviable: + a0 = tl.load(a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b0 = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + else: + a0 = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0) + if not USE_MLS_LOAD: + b0 = tl.load(b_ptrs) + else: + b0 = tl.matrix_load( + b_ptr + off_experts * stride_be, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N]) + + # We accumulate along the K dimension. 
+ if use_int8_w8a16: + accumulator = tl.dot(a0, b0.to(compute_type), acc=accumulator) + tl.static_assert(False, "Not implemented") + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + tl.arange(0, 2) + a_scale = tl.load(a_scale_ptrs + offs_ks[None, :] * stride_ask, + mask=token_mask[:, None], + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks[None, :] * stride_bsk) + a_scale_0, a_scale_1 = tl.split(a_scale) + b_scale_0, b_scale_1 = tl.split(b_scale) + + if BLOCK_SIZE_N > group_n: + accumulator += tl.dot(a0, b0) * a_scale_0[:, None] * b_scale_0[None, :] + else: + accumulator += tl.dot(a0, b0) * (a_scale_0[:, None] * b_scale_0) + + if not block_k_diviable: + a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, + mask=token_mask[:, None] & (offs_k[None, :] < K - (k + 1) * BLOCK_SIZE_K), + other=0.0) + b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk, + mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0) + else: + a1 = tl.load(a_ptrs + BLOCK_SIZE_K * stride_ak, mask=token_mask[:, None], other=0.0) + if not USE_MLS_LOAD: + b1 = tl.load(b_ptrs + BLOCK_SIZE_K * stride_bk) + else: + b1 = tl.matrix_load( + b_ptr + off_experts * stride_be, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k + BLOCK_SIZE_K, (pid_n * BLOCK_SIZE_N) % N]) + + if BLOCK_SIZE_N > group_n: + accumulator += tl.dot(a1, b1) * a_scale_1[:, None] * b_scale_1[None, :] + else: + accumulator += tl.dot(a1, b1) * (a_scale_1[:, None] * b_scale_1) + else: + if use_fp8_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + tl.static_assert(False, "Not implemented") + else: + accumulator += tl.dot(a, b) + tl.static_assert(False, "Not implemented") + + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak * 2 + b_ptrs += BLOCK_SIZE_K * stride_bk * 2 + mls_offs_k += BLOCK_SIZE_K * 2 + + else: # non-COMBINE_SCALE_LOAD + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + + # Load the next block of A and B, generate a mask by checking the + # K dimension. + if not block_k_diviable: + a = tl.load(a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + else: + if not bottom_a_use_mls_load: + a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0) + else: + a = tl.matrix_load( + a_ptr, + shape=[total_tokens, K], + strides=[stride_am, stride_ak], + block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_K], + offsets=[pid_m * BLOCK_SIZE_M, mls_offs_k]) + # mask_a_mls = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) < total_tokens + # a = tl.where(mask_a_mls[:, None], a, 0) + + if not USE_MLS_LOAD: + b = tl.load(b_ptrs) + else: + b = tl.matrix_load( + b_ptr + off_experts * stride_be, + shape=[K, N], + strides=[stride_bk, stride_bn], + block_shape=[BLOCK_SIZE_K, BLOCK_SIZE_N], + offsets=[mls_offs_k, (pid_n * BLOCK_SIZE_N) % N]) + + # We accumulate along the K dimension. + if use_int8_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, + mask=token_mask, + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + if BLOCK_SIZE_N > group_n: + accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :] + else: + accumulator += tl.dot(a, b) * (a_scale[:, None] * b_scale) + else: + if use_fp8_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + else: + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + mls_offs_k += BLOCK_SIZE_K + + if MUL_ROUTED_WEIGHT: + # Both of them can work well. + if ck_sorting: + moe_weight = tl.load(sorted_weights_ptr + offs_token_id, + mask=token_mask, + other=0) + else: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + if use_int8_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + if USE_ADDR_OFFSET_INT64_C: + if c_sorted: + c_ptrs = c_ptr + (stride_cm * offs_token_id[:, None].to(tl.int64) + stride_cn * offs_cn[ + None, :].to(tl.int64)) + else: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None].to(tl.int64) + stride_cn * offs_cn[ + None, :].to(tl.int64)) + else: + if c_sorted: + c_ptrs = c_ptr + (stride_cm * offs_token_id[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int32) + else: + c_ptrs = c_ptr + (stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :]).to(tl.int32) + if not block_n_diviable: + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + else: + c_mask = token_mask[:, None] + + tl.store(c_ptrs, accumulator, mask=c_mask) + + tile_id += NUM_SMS + + +_USE_MOE_PERSISTENT_KERNEL = False + +def moe_set_use_persistent_kernel(value: bool): + global _USE_MOE_PERSISTENT_KERNEL + _USE_MOE_PERSISTENT_KERNEL = value + +def fused_moe( + A: torch.Tensor, + B: torch.Tensor, + C: torch.Tensor, + A_scale: Optional[torch.Tensor], + B_scale: Optional[torch.Tensor], + B_zp: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: 
torch.Tensor, + sorted_token_ids: torch.Tensor, + sorted_weights: Optional[torch.Tensor], + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, + top_k: int, + compute_type: tl.dtype, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_int4_w4a8: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + block_shape: Optional[List[int]] = None, + c_sorted: bool = False, + bottom_a_use_mls_load: bool = False, + ck_sorting: bool = False, + ck_topk: int = 8, + config: Optional[Dict[str, Any]] = None, +) -> None: + assert topk_weights is not None or not mul_routed_weight + assert topk_weights is None or topk_weights.stride(1) == 1 + assert sorted_token_ids.stride(0) == 1 + + if use_fp8_w8a8 or use_int8_w8a8: + assert B_scale is not None + assert (block_shape is None + or triton.cdiv(B.size(-2), block_shape[0]) == B_scale.size(-2)) + assert (block_shape is None + or triton.cdiv(B.size(-1), block_shape[1]) == B_scale.size(-1)) + + elif use_int8_w8a16 or use_int4_w4a16 or use_int4_w4a8: + assert B_scale is not None + assert block_shape is None or block_shape[0] == 0 + else: + assert A_scale is None + assert B_scale is None + + total_tokens = A.size(0) + num_tokens = topk_ids.numel() + + if config is None: + moe_config_func = get_optimal_moe_config_func( + A, B, topk_ids, + use_int8_w8a16=use_int8_w8a16, + use_int8_w8a8=use_int8_w8a8, + use_fp8_w8a8=use_fp8_w8a8, + use_int4_w4a16=use_int4_w4a16, + use_int4_w4a8=use_int4_w4a8, + use_mxfp4_w4a4=use_mxfp4_w4a4, + block_shape=block_shape, + is_bottom=mul_routed_weight) + config = moe_config_func(M) + + if "USE_MLS_LOAD" not in config: + config["USE_MLS_LOAD"] = False + if config["USE_MLS_LOAD"] == True and capMLS == False: + logger.warning("USE_MLS_LOAD is not supported for this architecture!!!") + config["USE_MLS_LOAD"] = False + + EM = sorted_token_ids.size(0) + if A.size(0) < 
config["BLOCK_SIZE_M"]: + # optimize for small batch_size. + # We assume that top_ids of each token is unique, so + # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, + # and we can skip some invalid blocks. + EM = min(sorted_token_ids.size(0), + A.size(0) * top_k * config['BLOCK_SIZE_M']) + grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( + B.size(1), META['BLOCK_SIZE_N']), ) + + input_dtype = str(A.dtype).split('.')[-1] + + if (use_int8_w8a16 or use_int4_w4a16 or use_int4_w4a8) and \ + block_shape is not None and block_shape[1] > 0: + assert B_scale is not None and B_scale.ndim == 3 + assert B_zp is None or B_zp.ndim == 3 + offset_max = 2**31 - 1 + use_addr_offset_int64_a = A.numel() * A.element_size() >= offset_max + use_addr_offset_int64_c = C.numel() * C.element_size() >= offset_max + + if use_int4_w4a8: + return fused_moe_kernel_gptq_awq_w4a8[grid]( + A, + B, + C, + A_scale, + B_scale, + B_zp, + topk_weights, + sorted_token_ids, + sorted_weights, + expert_ids, + num_tokens_post_padded, + B.size(1), + A.size(1), + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(-2), + C.stride(-1), + A_scale.stride(0) + if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + B_zp.stride(0) if B_zp is not None else 0, + B_zp.stride(2) if B_zp is not None else 0, + B_zp.stride(1) if B_zp is not None else 0, + group_k=block_shape[1], + group_size=block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a, + USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c, + top_k=top_k, + compute_type=compute_type, + has_zp=B_zp is not None, + use_int4_w4a16=use_int4_w4a16, + use_int4_w4a8=use_int4_w4a8, + use_int8_w8a16=use_int8_w8a16, + ck_sorting=ck_sorting, + ck_topk=ck_topk, + NUM_XCDS=1, + **config + ) + + 
fused_moe_kernel_gptq_awq[grid]( + A, + B, + C, + B_scale, + B_zp, + topk_weights, + sorted_token_ids, + sorted_weights, + expert_ids, + num_tokens_post_padded, + B.size(1), + A.size(1), + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(-2), + C.stride(-1), + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + B_zp.stride(0) if B_zp is not None else 0, + B_zp.stride(2) if B_zp is not None else 0, + B_zp.stride(1) if B_zp is not None else 0, + group_size=block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a, + USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c, + top_k=top_k, + compute_type=compute_type, + has_zp=B_zp is not None, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + ck_sorting=ck_sorting, + ck_topk=ck_topk, + NUM_XCDS=1, + **config, + ) + else: + offset_max = 2**31 - 1 + use_addr_offset_int64_a = A.numel() * A.element_size() >= offset_max + use_addr_offset_int64_c = C.numel() * C.element_size() >= offset_max + use_addr_offset_int64_b = B.numel() * B.element_size() >= offset_max + + config = config.copy() + BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K") + if block_shape is not None: + BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], + block_shape[1])) + + if use_int4_w4a8 and per_channel_quant: + assert B_scale is not None and B_scale.ndim in (2, 3) + assert B_zp is None + channelwise_config = config.copy() + channelwise_config.pop("USE_MLS_LOAD", None) + channelwise_config.pop("COMBINE_SCALE_LOAD", None) + channelwise_config["USE_MLS_LOAD"] = False + channelwise_config["COMBINE_SCALE_LOAD"] = False + w4a8_grid = lambda META: ( + triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]), + ) + return fused_moe_kernel_gptq_awq_w4a8_channelwise[w4a8_grid]( + A, + B, + C, + A_scale, + B_scale, + topk_weights, + sorted_token_ids, + sorted_weights, + expert_ids, + num_tokens_post_padded, + 
B.size(1), + A.size(1), + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(-2), + C.stride(-1), + A_scale.stride(0) + if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0), + B_scale.stride(1), + MUL_ROUTED_WEIGHT=mul_routed_weight, + USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a, + USE_ADDR_OFFSET_INT64_B=use_addr_offset_int64_b, + USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c, + top_k=top_k, + compute_type=compute_type, + use_int4_w4a8=use_int4_w4a8, + ck_sorting=ck_sorting, + ck_topk=ck_topk, + NUM_XCDS=1, + BLOCK_SIZE_K=BLOCK_SIZE_K, + **channelwise_config, + ) + + if use_addr_offset_int64_b: + raise Exception("use_addr_offset_int64_b is not written for ep some cases!") + + SPLIT_K = config.pop("SPLIT_K", splitk_size) + if mul_routed_weight: + SPLIT_K = 0 + + if _USE_MOE_PERSISTENT_KERNEL: + # Note: the sms count is irrelevant to occupancy(lds and regs). 
+ NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count * 2 + + grid = lambda META: ( + min( + NUM_SMS, + triton.cdiv(sorted_token_ids.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]), + ), + ) + + fused_moe_persistent_kernel[grid]( + A, + B, + C, + topk_weights, + num_tokens_post_padded, + expert_ids, + sorted_token_ids, + sorted_weights, + A_scale, + B_scale, + B.size(1), + B.size(2), + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(-2), + C.stride(-1), + A_scale.stride(0) + if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) + if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) + if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) + if B_scale is not None and B_scale.ndim >= 2 else 0, + A.size(0), + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a, + USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + c_sorted=c_sorted, + bottom_a_use_mls_load=bottom_a_use_mls_load, + ck_sorting=ck_sorting, + ck_topk=ck_topk, + NUM_SMS=NUM_SMS, + NUM_XCDS=1, + BLOCK_SIZE_K=BLOCK_SIZE_K, + COMBINE_SCALE_LOAD=config.pop("COMBINE_SCALE_LOAD", None), + **config, + ) + elif SPLIT_K > 1: + + grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( + B.shape[1], META['BLOCK_SIZE_N']), SPLIT_K) + assert B.size(2) % (BLOCK_SIZE_K * SPLIT_K) == 0, "B.size(2) must be divisible by BLOCK_SIZE_K * SPLIT_K" + + splitk_cache = torch.zeros((SPLIT_K,) + C.shape, device=C.device,dtype=C.dtype) + + use_addr_offset_int64_c = 
C.numel() * C.element_size() * SPLIT_K >= offset_max + + fused_moe_splitk_kernel[grid]( + A, + B, + splitk_cache, + topk_weights, + num_tokens_post_padded, + expert_ids, + sorted_token_ids, + sorted_weights, + A_scale, + B_scale, + B.size(1), + B.size(2), + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + splitk_cache.stride(-3), + splitk_cache.stride(-2), + splitk_cache.stride(-1), + A_scale.stride(0) + if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) + if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) + if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) + if B_scale is not None and B_scale.ndim >= 2 else 0, + A.size(0), + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + SPLIT_K=SPLIT_K, + USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a, + USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + c_sorted=c_sorted, + bottom_a_use_mls_load=bottom_a_use_mls_load, + ck_sorting=ck_sorting, + ck_topk=ck_topk, + NUM_XCDS=1, + BLOCK_SIZE_K=BLOCK_SIZE_K, + COMBINE_SCALE_LOAD=config.pop("COMBINE_SCALE_LOAD", None), + **config, + ) + torch.sum(splitk_cache, dim=0, out=C) + # # C.copy_(torch.sum(splitk_cache.to(torch.float32), dim=0).to(C.dtype)) + # triton_splitk_reduce(splitk_cache, C) + else: + fused_moe_kernel[grid]( + A, + B, + C, + topk_weights, + num_tokens_post_padded, + expert_ids, + sorted_token_ids, + sorted_weights, + A_scale, + B_scale, + B.size(1), + B.size(2), + EM, + topk_ids.numel(), + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(-2), + C.stride(-1), + A_scale.stride(0) + if 
A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) + if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) + if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) + if B_scale is not None and B_scale.ndim >= 2 else 0, + A.size(0), + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + USE_ADDR_OFFSET_INT64_A=use_addr_offset_int64_a, + USE_ADDR_OFFSET_INT64_C=use_addr_offset_int64_c, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + c_sorted=c_sorted, + bottom_a_use_mls_load=bottom_a_use_mls_load, + ck_sorting=ck_sorting, + ck_topk=ck_topk, + NUM_XCDS=1, + BLOCK_SIZE_K=BLOCK_SIZE_K, + COMBINE_SCALE_LOAD=config.pop("COMBINE_SCALE_LOAD", None), + **config, + ) diff --git a/aiter/ops/triton/moe_op_e2e.py b/aiter/ops/triton/moe_op_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..d4307bde57e7ed588e2d84aa99ab19ce0adae918 --- /dev/null +++ b/aiter/ops/triton/moe_op_e2e.py @@ -0,0 +1,766 @@ +# SPDX-License-Identifier: MIT + +import torch +import triton +import triton.language as tl +from typing import Any, Dict, Optional + +from aiter.ops.triton.quant import dynamic_per_tensor_quant_fp8_i8 +from aiter.ops.triton.utils.types import torch_to_triton_dtype + +# Source: +# MoE Kernel adapted from VLLM + +_PADDING_SIZE = 0 + +_MOE_A_QUANT_FUNC = dynamic_per_tensor_quant_fp8_i8 + +_USE_MOE_PERSISTENT_KERNEL = False + + +def moe_set_use_persistent_kernel(value: bool): + global _USE_MOE_PERSISTENT_KERNEL + _USE_MOE_PERSISTENT_KERNEL = value + + +def moe_set_padding_size(size: int): + """ + Override padding size + """ + global _PADDING_SIZE + _PADDING_SIZE = size + + +def moe_set_quant_func(func): + """ + Override 'A' 
matrix ie activations quantization function. + Default function does dynamic quantization. + """ + global _MOE_A_QUANT_FUNC + _MOE_A_QUANT_FUNC = func + + +@triton.heuristics( + { + "GRID_MN": lambda args: triton.cdiv(args["EM"], args["BLOCK_SIZE_M"]) + * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]) + } +) +@triton.jit +def e2e_moe_kernel( + A, + W1, + W2, + Out, + A_scale, + W1_scale, + W2_scale, + stride_am, + stride_ak, + stride_w1e, + stride_w1n, + stride_w1k, + stride_w2e, + stride_w2n, + stride_w2k, + stride_cm, + stride_w1se, + stride_w1sn, + stride_w2se, + stride_w2sk, + top_k: tl.constexpr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + num_valid_tokens, + EM: tl.constexpr, + N: tl.constexpr, + K: tl.constexpr, + EVEN_K: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K1: tl.constexpr, # original block_size_k + BLOCK_SIZE_K2: tl.constexpr, # outputs (EM, BLOCK_SIZE_K2) + GROUP_SIZE_M: tl.constexpr, + GRID_MN: tl.constexpr, + atomic_num_stages: tl.constexpr, + dtype: tl.constexpr, +): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - a: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - w1: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - w2: The stacked MOE weight tensor with shape (E, K, N // 2), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. 
+ - c: The output cache tensor with shape (M, topk, K), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and K is the output feature dimension. + - sorted_token_ids: a tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: a tensor containing the indices of the expert for each + block. It determines which expert matrices from w1 and w2 should be + used for each block in a. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_w1e > 0) + tl.assume(stride_w1n > 0) + tl.assume(stride_w1k > 0) + tl.assume(stride_w2e > 0) + tl.assume(stride_w2n > 0) + tl.assume(stride_w2k > 0) + tl.assume(stride_cm > 0) + if use_int8_w8a16: + tl.assume(stride_w1se > 0) + tl.assume(stride_w1sn > 0) + tl.assume(stride_w2se > 0) + tl.assume(stride_w2sk > 0) + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + NUM_XCDS: tl.constexpr = 8 + + ## pid remapping on xcds + # Number of pids per XCD in the new arrangement + pids_per_xcd = (GRID_MN + NUM_XCDS - 1) // NUM_XCDS + # When GRID_MN is not divisible by NUM_XCDS, some xcds will have + # pids_per_xcd pids, the other will have pids_per_xcd - 1 pids. 
+ # We calculate the number of xcds that have pids_per_xcd pids as + # tall_xcds + tall_xcds = GRID_MN % NUM_XCDS + tall_xcds = NUM_XCDS if tall_xcds == 0 else tall_xcds + # Compute current XCD and local pid within the XCD + xcd = pid % NUM_XCDS + local_pid = pid // NUM_XCDS + # Calculate new pid based on the new grouping + # Note that we need to consider the following two cases: + # 1. the current pid is on a tall xcd + # 2. the current pid is on a short xcd + if xcd < tall_xcds: + pid = xcd * pids_per_xcd + local_pid + else: + pid = ( + tall_xcds * pids_per_xcd + + (xcd - tall_xcds) * (pids_per_xcd - 1) + + local_pid + ) + + if GROUP_SIZE_M == 1: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. 
+ # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m) + offs_k1 = tl.arange(0, BLOCK_SIZE_K1) + offs_k2 = tl.arange(0, BLOCK_SIZE_K2) + + BLOCK_SIZE_HALF: tl.constexpr = BLOCK_SIZE_N // 2 + i = tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + # [0, 0, 1, 1, ..., BLOCK_SIZE_HALF - 1, BLOCK_SIZE_HALF - 1] + i_floor = i // 2 + offs_half = (pid_n * (BLOCK_SIZE_N // 2) + i_floor) % (N // 2) + # (i % 2): [0, 1, 0, 1, ...] (alternating) + # (i % 2) * (N // 2) : [0, (N // 2), 0, (N // 2),...] 
+ # So offs_w1n now takes element from the first BLOCK_SIZE_HALF half and the second BLOCK_SIZE_HALF half in an alternating way (This allows us to do reshape without permute) + offs_w1n = (offs_half + (i % 2) * (N // 2)) % N + + mask_w1n = (pid_n * BLOCK_SIZE_N + i) < N + + a_ptrs = A + ( + offs_token[:, None] // top_k * stride_am + offs_k1[None, :] * stride_ak + ) + w1_ptrs = ( + W1 + + off_experts * stride_w1e + + (offs_k1[:, None] * stride_w1k + offs_w1n[None, :] * stride_w1n) + ) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + if use_int8_w8a16: + w1_scale_ptrs = ( + W1_scale + off_experts * stride_w1se + offs_w1n[None, :] * stride_w1sn + ) + w1_scale = tl.load(w1_scale_ptrs) + + if use_fp8_w8a8: + a_scale = tl.load(A_scale) + w1_scale = tl.load(W1_scale + off_experts) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K1)): + # Masking ensures we don't load from invalid tokens or indices + if EVEN_K: + a = tl.load(a_ptrs, mask=(token_mask[:, None]), other=0.0) + w1 = tl.load(w1_ptrs, mask=mask_w1n[None, :], other=0.0) + else: + a = tl.load( + a_ptrs, + mask=(token_mask[:, None] & (offs_k1[None, :] < K - k * BLOCK_SIZE_K1)), + other=0.0, + ) + w1 = tl.load( + w1_ptrs, + mask=(offs_k1[:, None] < K - k * BLOCK_SIZE_K1) & mask_w1n[None, :], + other=0.0, + ) + # w1 = tl.zeros((BLOCK_SIZE_K1, BLOCK_SIZE_N), dtype=dtype) + + if use_int8_w8a16: + accumulator = tl.dot(a, w1.to(a.type), acc=accumulator) + elif use_fp8_w8a8: + accumulator += tl.dot(a, w1) + else: + accumulator = tl.dot(a, w1, acc=accumulator) + a_ptrs += BLOCK_SIZE_K1 * stride_ak + w1_ptrs += BLOCK_SIZE_K1 * stride_w1k + + if use_int8_w8a16: + accumulator = accumulator * w1_scale + elif use_fp8_w8a8: + accumulator = accumulator * a_scale * w1_scale + + silu_acc, mul_acc = accumulator.reshape(BLOCK_SIZE_M, BLOCK_SIZE_HALF, 2).split() + silu_acc = silu_acc / (1.0 + tl.exp2(-(silu_acc * 1.44269504089))) + acc = (silu_acc * mul_acc).to(dtype) + + # TODO scale acc + acc_scale = 1.0 + # 
TODO scale acc + # ------------------------------- + + offs_w2n = tl.arange(0, BLOCK_SIZE_N // 2) + pid_n * (BLOCK_SIZE_N // 2) + + w2_ptrs = ( + W2 + + off_experts * stride_w2e + + (offs_k2[None, :] * stride_w2k + offs_w2n[:, None] * stride_w2n) + ) + out_ptrs = Out + stride_cm * offs_token[:, None] + offs_k2[None, :] + + # if use_int8_w8a16: + # w2_scale_ptrs = W2_scale + off_experts * stride_w2se + offs_w2n[None, :] + # w2_scale = tl.load(w2_scale_ptrs) + if use_fp8_w8a8: + # acc_quantized, _, acc_scale = quantize_tensor_triton(acc, dtype=fp8_type) + w2_scale = tl.load(W2_scale + off_experts) + + # minus if pid_m is even otherwise positive + k_sign = (pid_m % 2) * 2 - 1 + num_k = tl.cdiv(K, BLOCK_SIZE_K2) + for _k in tl.range(0, num_k, num_stages=atomic_num_stages): + k = (num_k + (_k * k_sign)) % num_k + k = ((k + pid_n * 4)) % num_k + # k = _k + + if use_int8_w8a16: + w2_scale_ptrs = ( + W2_scale + + off_experts * stride_w2se + + (offs_k2 + k * BLOCK_SIZE_K2)[None, :] * stride_w2sk + ) + w2_scale = tl.load(w2_scale_ptrs) + + if EVEN_K: + w2 = tl.load( + w2_ptrs + k * BLOCK_SIZE_K2 * stride_w2k, + mask=(offs_w2n[:, None] < N // 2), + other=0.0, + ) + else: + w2 = tl.load( + w2_ptrs + k * BLOCK_SIZE_K2 * stride_w2k, + mask=( + (offs_w2n[:, None] < N // 2) + & ((offs_k2 + k * BLOCK_SIZE_K2)[None, :] < K) + ), + other=0.0, + ) + # w2 = tl.zeros((BLOCK_SIZE_HALF, BLOCK_SIZE_K2), dtype=dtype) + + if use_int8_w8a16: + out = tl.dot(acc, w2.to(dtype)) + else: + out = tl.dot(acc, w2) + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load( + topk_weights_ptr + offs_token, mask=token_mask, other=0 + ) + out = out * moe_weight[:, None] + + if use_int8_w8a16: + out = out * w2_scale + elif use_fp8_w8a8: + out = out * acc_scale * w2_scale + + # # atomic add + if EVEN_K: + c_mask = token_mask[:, None] + else: + c_mask = token_mask[:, None] & ((offs_k2 + k * BLOCK_SIZE_K2)[None, :] < K) + + # TODO check scope + tl.atomic_add( + out_ptrs + k * BLOCK_SIZE_K2, + out.to(dtype), + 
mask=c_mask, + sem="relaxed", + scope="cta", + ) + # tl.store(out_ptrs + k * BLOCK_SIZE_K2, out, mask=c_mask) + + +@triton.jit +def e2e_moe_persistent_kernel( + A, + W1, + W2, + intermediate_ptr, + Out, + A_scale, + W1_scale, + W2_scale, + stride_am, + stride_ak, + stride_w1e, + stride_w1n, + stride_w1k, + stride_w2e, + stride_w2n, + stride_w2k, + stride_cm, + stride_w1se, + stride_w1sn, + stride_w2se, + stride_w2sk, + stride_im, + top_k: tl.constexpr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + num_valid_tokens, + EM: tl.constexpr, + N: tl.constexpr, + K: tl.constexpr, + EVEN_K: tl.constexpr, + EVEN_N: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N1: tl.constexpr, + BLOCK_SIZE_N2: tl.constexpr, + BLOCK_SIZE_K1: tl.constexpr, # original block_size_k + BLOCK_SIZE_K2: tl.constexpr, # outputs (EM, BLOCK_SIZE_K2) + NUM_SMS: tl.constexpr, +): + start_m = tl.program_id(axis=0) + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + + num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M) + num_pid_n: tl.constexpr = tl.cdiv(N, BLOCK_SIZE_N1) + num_pid_k: tl.constexpr = tl.cdiv(K, BLOCK_SIZE_K2) + m_tile_per_sm = num_pid_m // NUM_SMS + + if start_m < num_pid_m % NUM_SMS: + m_tile_per_sm += 1 + + N_HALF: tl.constexpr = N // 2 + BLOCK_SIZE_HALF: tl.constexpr = BLOCK_SIZE_N1 // 2 + + offs_k1 = tl.arange(0, BLOCK_SIZE_K1) + offs_k2 = tl.arange(0, BLOCK_SIZE_K2) + offs_n1 = tl.arange(0, BLOCK_SIZE_N1) + offs_n1_half = tl.arange(0, BLOCK_SIZE_HALF) + offs_n2 = tl.arange(0, BLOCK_SIZE_N2) + offs_m = tl.arange(0, BLOCK_SIZE_M) + i = offs_n1.to(tl.int64) + # [0, 0, 1, 1, ..., BLOCK_SIZE_HALF - 1, BLOCK_SIZE_HALF - 1] + i_floor = i // 2 + + dtype = Out.dtype.element_ty + + pid_m = start_m + + for _ in range(0, m_tile_per_sm): + # pid_m = pid_m_start + m_off + offs_token_id = pid_m * BLOCK_SIZE_M + offs_m + 
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + + # Here we assume that valid tokens are in the range [0, M). + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m) + # tl.device_print("pid_m", pid_m) + # TODO mem fault when when pid_n != 0 + for pid_n in range(0, num_pid_n): + offs_half = (pid_n * BLOCK_SIZE_HALF + i_floor) % N_HALF + # (i % 2): [0, 1, 0, 1, ...] (alternating) + # (i % 2) * (N // 2) : [0, (N // 2), 0, (N // 2),...] + # So offs_w1n now takes element from the first BLOCK_SIZE_HALF half and the second BLOCK_SIZE_HALF half in an alternating way (This allows us to do reshape without permute) + offs_w1n = (offs_half + (i % 2) * (N_HALF)) % N + + mask_w1n = (pid_n * BLOCK_SIZE_N1 + i) < N + + a_ptrs = A + ( + offs_token[:, None] // top_k * stride_am + offs_k1[None, :] * stride_ak + ) + w1_ptrs = ( + W1 + + off_experts * stride_w1e + + (offs_k1[:, None] * stride_w1k + offs_w1n[None, :] * stride_w1n) + ) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N1), dtype=tl.float32) + + if use_int8_w8a16: + w1_scale_ptrs = ( + W1_scale + + off_experts * stride_w1se + + offs_w1n[None, :] * stride_w1sn + ) + w1_scale = tl.load(w1_scale_ptrs) + if use_fp8_w8a8: + a_scale = tl.load(A_scale) + w1_scale = tl.load(W1_scale + off_experts) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K1)): + # Masking ensures we don't load from invalid tokens or indices + if EVEN_K: + a = tl.load(a_ptrs, mask=(token_mask[:, None]), other=0.0) + # TODO memory fault N dim, might be k as well + w1 = tl.load(w1_ptrs, mask=mask_w1n[None, :], other=0.0) + else: + a = tl.load( + a_ptrs, + mask=( + token_mask[:, None] + & (offs_k1[None, :] < K - k * BLOCK_SIZE_K1) + ), + other=0.0, + ) + w1 = tl.load( + w1_ptrs, + mask=(offs_k1[:, None] < K - k * BLOCK_SIZE_K1) + & mask_w1n[None, :], + other=0.0, + ) + + if use_int8_w8a16: + accumulator = tl.dot(a, w1.to(a.type), acc=accumulator) + elif use_fp8_w8a8: + accumulator += tl.dot(a, w1) + else: + 
accumulator = tl.dot(a, w1, acc=accumulator) + a_ptrs += BLOCK_SIZE_K1 * stride_ak + w1_ptrs += BLOCK_SIZE_K1 * stride_w1k + + if use_int8_w8a16: + accumulator = accumulator * w1_scale + elif use_fp8_w8a8: + accumulator = accumulator * a_scale * w1_scale + + silu_acc, mul_acc = accumulator.reshape( + BLOCK_SIZE_M, BLOCK_SIZE_HALF, 2 + ).split() + silu_acc = silu_acc / (1.0 + tl.exp2(-(silu_acc * 1.44269504089))) + acc = (silu_acc * mul_acc).to(dtype) + + offs_in = pid_n * BLOCK_SIZE_HALF + offs_n1_half + i_mask = token_mask[:, None] & (offs_in[None, :] < N_HALF) + i_ptrs = ( + intermediate_ptr + stride_im * offs_token[:, None] + offs_in[None, :] + ) + # TODO dtye?? + tl.atomic_add(i_ptrs, acc, mask=i_mask, sem="release") + # TODO quantization + + for pid_k in range(0, num_pid_k): + offs_w2k = (pid_k * BLOCK_SIZE_K2 + offs_k2) % K + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + + intermediate_ptrs = intermediate_ptr + ( + offs_token[:, None] * stride_im + offs_n2[None, :] + ) + w2_ptrs = ( + W2 + + off_experts * stride_w2e + + (offs_n2[:, None] * stride_w2n + offs_w2k[None, :] * stride_w2k) + ) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K2), dtype=tl.float32) + + mask_w2k = (pid_k * BLOCK_SIZE_K2 + offs_k2) < K + + if use_int8_w8a16: + w2_scale_ptrs = ( + W2_scale + + off_experts * stride_w2se + + offs_k2[None, :] * stride_w2sk + ) + w2_scale = tl.load(w2_scale_ptrs) + + if use_fp8_w8a8: + # TODO calculate the intermediate scale and scale intermediate + # a_scale = tl.load(A_scale) + i_scale = 1 + w2_scale = tl.load(W2_scale + off_experts) + + for n in range(0, tl.cdiv(N_HALF, BLOCK_SIZE_N2)): + # Masking ensures we don't load from invalid tokens or indices + + if EVEN_N: + intermediate = tl.load( + intermediate_ptrs, mask=(token_mask[:, None]), other=0.0 + ) + w2 = tl.load(w2_ptrs) + else: + intermediate = tl.load( + intermediate_ptrs, + mask=( + token_mask[:, None] + & (offs_n2[None, :] < N_HALF - n * BLOCK_SIZE_N2) + ), + other=0.0, + ) + 
w2 = tl.load( + w2_ptrs, + mask=(offs_n2[:, None] < N_HALF - n * BLOCK_SIZE_N2) + & mask_w2k[None, :], + other=0.0, + ) + + if use_int8_w8a16: + accumulator = tl.dot( + intermediate.to(dtype), w2.to(dtype), acc=accumulator + ) + elif use_fp8_w8a8: + accumulator += tl.dot(intermediate, w2) + else: + accumulator = tl.dot(intermediate.to(dtype), w2, acc=accumulator) + intermediate_ptrs += BLOCK_SIZE_N2 + w2_ptrs += BLOCK_SIZE_N2 * stride_w2n + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load( + topk_weights_ptr + offs_token, mask=token_mask, other=0 + ) + accumulator = accumulator * moe_weight[:, None] + + if use_int8_w8a16: + accumulator = accumulator * w2_scale + elif use_fp8_w8a8: + accumulator = accumulator * i_scale * w2_scale + + offs_ck = pid_k * BLOCK_SIZE_K2 + offs_k2 + c_mask = token_mask[:, None] & (offs_ck[None, :] < K) + out_ptrs = Out + stride_cm * offs_token[:, None] + offs_ck[None, :] + tl.store(out_ptrs, accumulator.to(dtype), mask=c_mask) + pid_m += NUM_SMS + + +def e2e_moe( + A: torch.Tensor, + W1: torch.Tensor, + W2: torch.Tensor, + Intermediate: torch.Tensor, + C: torch.Tensor, + A_scale: Optional[torch.Tensor], + W1_scale: Optional[torch.Tensor], + W2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + topk_ids, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, + top_k: int, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + config: Optional[Dict[str, Any]] = None, +) -> None: + """ + #TODO: Add doc + """ + assert topk_weights.stride(1) == 1 + assert sorted_token_ids.stride(0) == 1 + + # if use_fp8_w8a8: + # assert W1_scale is not None + # assert W2_scale is not None + # if block_shape is None: + # output = torch.zeros(A.shape, device=A.device, dtype=torch.float8_e4m3fnuz) + # A_scale = torch.zeros(1, device=A.device, dtype=torch.float32) + # A, A_scale = _MOE_A_QUANT_FUNC(output, A, A_scale) + # else: + # #TODO: Add support for per token group quantization + # 
assert len(block_shape) == 2 + # block_n, block_k = block_shape[0], block_shape[1] + # #A, A_scale = per_token_group_quant_fp8(A, block_k) + # assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + # assert triton.cdiv(W1.shape[-2], block_n) == B_scale.shape[-2] + # assert triton.cdiv(W1.shape[-1], block_k) == B_scale.shape[-1] + # elif use_int8_w8a16 or use_int4_w4a16: + # assert W1_scale is not None + # assert W2_scale is not None + # assert block_shape is None or block_shape[0] == 0 + # else: + # assert A_scale is None + # assert W1_scale is None + # assert W2_scale is None + + EM = sorted_token_ids.shape[0] + if A.shape[0] < config["BLOCK_SIZE_M"]: + # optimize for small batch_size. + # We assume that top_ids of each token is unique, so + # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, + # and we can skip some invalid blocks. + EM = min(sorted_token_ids.shape[0], A.shape[0] * top_k * config["BLOCK_SIZE_M"]) + + N = W1.shape[1] + K = A.shape[1] - _PADDING_SIZE + EVEN_K = K % config["BLOCK_SIZE_K1"] == 0 + + if EM > 1024: + atomic_num_stages = 2 + else: + atomic_num_stages = 1 + + stride_cm = C.stride(1) + if _USE_MOE_PERSISTENT_KERNEL: + NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count * 2 + # TODO add N_split support to get more parallelism + grid = lambda META: ( # noqa: E731 + min(NUM_SMS, triton.cdiv(sorted_token_ids.shape[0], META["BLOCK_SIZE_M"])), + ) + stride_im = Intermediate.stride(0) + EVEN_N = (N // 2) % config["BLOCK_SIZE_N2"] == 0 + + e2e_moe_persistent_kernel[grid]( + A, + W1, + W2, + Intermediate, + C, + A_scale, + W1_scale, + W2_scale, + A.stride(0), + A.stride(1), + W1.stride(0), + W1.stride(1), + W1.stride(2), + W2.stride(0), + W2.stride(2), + W2.stride(1), + stride_cm, + W1_scale.stride(0) if W1_scale is not None and W1_scale.ndim >= 2 else 0, + W1_scale.stride(1) if W1_scale is not None and W1_scale.ndim >= 2 else 0, + W2_scale.stride(0) if W2_scale is not None and W2_scale.ndim >= 2 else 0, + 
W1_scale.stride(1) if W2_scale is not None and W2_scale.ndim >= 2 else 0, + stride_im, + top_k, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + topk_ids.numel(), + EM, + N, + K, + EVEN_K, + EVEN_N, + MUL_ROUTED_WEIGHT=mul_routed_weight, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + NUM_SMS=NUM_SMS, + **config, + ) + + return C + else: + grid = lambda META: ( # noqa: E731 + triton.cdiv(EM, META["BLOCK_SIZE_M"]) + * triton.cdiv(W1.shape[1], META["BLOCK_SIZE_N"]), + ) + dtype = C.dtype + Out = C.to(torch.float32) if dtype == torch.bfloat16 else C + + e2e_moe_kernel[grid]( + A, + W1, + W2, + Out, + A_scale, + W1_scale, + W2_scale, + A.stride(0), + A.stride(1), + W1.stride(0), + W1.stride(1), + W1.stride(2), + W2.stride(0), + W2.stride(2), + W2.stride(1), + stride_cm, + W1_scale.stride(0) if W1_scale is not None and W1_scale.ndim >= 2 else 0, + W1_scale.stride(1) if W1_scale is not None and W1_scale.ndim >= 2 else 0, + W2_scale.stride(0) if W2_scale is not None and W2_scale.ndim >= 2 else 0, + W1_scale.stride(1) if W2_scale is not None and W2_scale.ndim >= 2 else 0, + top_k, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + topk_ids.numel(), + EM, + N, + K, + EVEN_K, + MUL_ROUTED_WEIGHT=mul_routed_weight, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + atomic_num_stages=atomic_num_stages, + dtype=torch_to_triton_dtype[dtype], + **config, + ) + + return Out.to(dtype) diff --git a/aiter/ops/triton/moe_op_gelu.py b/aiter/ops/triton/moe_op_gelu.py new file mode 100644 index 0000000000000000000000000000000000000000..63ff1d0b759b0ac7c435526a6c2915b27d227d37 --- /dev/null +++ b/aiter/ops/triton/moe_op_gelu.py @@ -0,0 +1,616 @@ +# SPDX-License-Identifier: MIT + +import torch +import triton +import triton.language as tl +from typing import Any, Dict, Optional, List + +from aiter.ops.triton.quant import dynamic_per_tensor_quant_fp8_i8 +from aiter.ops.triton.activation import _gelu_tanh 
from aiter.ops.triton.utils.pid_preprocessing import pid_grid, remap_xcd
from aiter.ops.triton.utils.moe_common import _write_zeros_to_output

# Source:
# MoE Kernel adapted from VLLM

# Number of trailing columns of A that are subtracted from A.shape[1] to
# obtain the K passed to the kernels (see `fused_moe_gelu`).
_PADDING_SIZE = 0

# Function used to quantize the activations when `use_fp8_w8a8` is set and no
# `block_shape` is given. Called as `func(output, A, A_scale)`.
_MOE_A_QUANT_FUNC = dynamic_per_tensor_quant_fp8_i8

# When True, `fused_moe_gelu` launches `_fused_moe_persistent_kernel`.
_USE_MOE_PERSISTENT_KERNEL = False


def moe_set_use_persistent_kernel(value: bool):
    """
    Select between the persistent (True) and the regular (False) kernel.
    """
    global _USE_MOE_PERSISTENT_KERNEL
    _USE_MOE_PERSISTENT_KERNEL = value


def moe_set_padding_size(size: int):
    """
    Override padding size
    """
    global _PADDING_SIZE
    _PADDING_SIZE = size


def moe_set_quant_func(func):
    """
    Override 'A' matrix ie activations quantization function.
    Default function does dynamic quantization.
    """
    global _MOE_A_QUANT_FUNC
    _MOE_A_QUANT_FUNC = func


@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
    }
)
@triton.jit
def _fused_moe_kernel(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsk,
    stride_bsn,
    # Block size for block-wise quantization
    BLOCK_SCALE: tl.constexpr,
    group_n: tl.constexpr,
    group_k: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    use_fp8_w8a8: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices. GELU (tanh form) is applied to the result
    when MUL_ROUTED_WEIGHT is False.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A. A value of -1 makes the program write zeros.
    - BLOCK_SCALE / group_n / group_k: when BLOCK_SCALE is set (fp8 path),
        B scales are indexed by `n // group_n` and `k // group_k`.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.
    """
    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    NUM_XCDS: tl.constexpr = 8

    GRID_MN = num_pid_n * num_pid_m
    if pid < GRID_MN:
        pid = remap_xcd(pid, GRID_MN, NUM_XCDS)
    else:
        return  # rest of the tiles are dummy paddings
    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction
    # and accumulate
    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    # Entries >= num_valid_tokens are padding and must not be read or written.
    token_mask = offs_token < num_valid_tokens

    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
    if off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        _write_zeros_to_output(
            c_ptr,
            stride_cm,
            stride_cn,
            pid_n,
            N,
            offs_token,
            token_mask,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            compute_type,
        )
        return

    # `% N` wraps out-of-range columns so B loads stay in bounds; the final
    # store is masked with `offs_cn < N` instead.
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # Each sorted entry is a (token, k-th expert) slot; `// top_k` recovers
    # the row of A.
    a_ptrs = a_ptr + (
        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
    )

    b_ptrs = (
        b_ptr
        + off_experts * stride_be
        + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
    )
    if use_int8_w8a16:
        b_scale_ptrs = (
            b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
        )
        b_scale = tl.load(b_scale_ptrs)

    if use_fp8_w8a8:
        if BLOCK_SCALE:
            # Block-wise scales: loaded per K step inside the main loop.
            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
            offs_bsn = offs_bn // group_n
            b_scale_ptrs = (
                b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
            )
        else:
            # Per-tensor A scale and per-expert B scale, applied after the loop.
            a_scale = tl.load(a_scale_ptr)
            b_scale = tl.load(b_scale_ptr + off_experts)

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted to `compute_type` after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Load the next block of A and B, generate a mask by checking the
        # K dimension.
        if EVEN_K:
            a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
            b = tl.load(b_ptrs)
        else:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                other=0.0,
            )
            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        # We accumulate along the K dimension.
        if use_int8_w8a16:
            accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
        elif use_fp8_w8a8:
            if BLOCK_SCALE:
                k_start = k * BLOCK_SIZE_K
                offs_ks = k_start // group_k
                a_scale = tl.load(
                    a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0
                )
                b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)

                accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
            else:
                accumulator = tl.dot(a, b, acc=accumulator)
        else:
            accumulator += tl.dot(a, b)
        # Advance the ptrs to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    if MUL_ROUTED_WEIGHT:
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator = accumulator * moe_weight[:, None]
    if use_int8_w8a16:
        accumulator = accumulator * b_scale
    elif use_fp8_w8a8:
        # With BLOCK_SCALE the scales were already applied inside the loop.
        if not BLOCK_SCALE:
            accumulator = accumulator * a_scale * b_scale

    if not MUL_ROUTED_WEIGHT:
        accumulator = _gelu_tanh(accumulator)

    accumulator = accumulator.to(compute_type)
    # -----------------------------------------------------------
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)


@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
    }
)
@triton.jit
def _fused_moe_persistent_kernel(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsk,
    stride_bsn,
    # Block size for block-wise quantization
    BLOCK_SCALE: tl.constexpr,  # True if group_n and group_k are both > 0
    group_n: tl.constexpr,
    group_k: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    NUM_SMS: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    use_fp8_w8a8: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices.
    This is the persistent version of the fused_moe kernel: each program
    starts at tile `program_id` and walks the tile space with stride NUM_SMS.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.

    NOTE(review): unlike `_fused_moe_kernel`, this kernel has no
    `off_experts == -1` branch, so an expert id of -1 would be used directly
    in the B pointer arithmetic - confirm callers never pass -1 here.
    """
    # -----------------------------------------------------------
    # Simply compute how many iterations each persistent block needs to do
    start_pid = tl.program_id(axis=0)
    NUM_XCDS: tl.constexpr = 8

    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    tile_id = start_pid

    offs_k = tl.arange(0, BLOCK_SIZE_K)

    # Load tile-invariant runtime constant
    num_tiles = num_pid_m * num_pid_n

    # Compute how many tiles are outside the padding region
    num_valid_tiles = tl.cdiv((num_tiles - tile_id), NUM_SMS)

    for _ in range(0, num_valid_tiles):
        tile_id_remapped = remap_xcd(tile_id, num_tiles, NUM_XCDS)
        pid_m, pid_n = pid_grid(tile_id_remapped, num_pid_m, num_pid_n, GROUP_SIZE_M)

        # Compute the mask
        offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
        offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
        token_mask = offs_token < num_valid_tokens
        off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)

        # Compute the A pointer
        a_ptrs = a_ptr + (
            offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
        )
        # Compute the B pointer
        offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
        )

        if use_int8_w8a16:
            b_scale_ptrs = (
                b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
            )
            b_scale = tl.load(b_scale_ptrs)

        if use_fp8_w8a8:
            if BLOCK_SCALE:
                # Block-wise scales: loaded per K step inside the K loop.
                a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
                offs_bsn = offs_bn // group_n
                b_scale_ptrs = (
                    b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
                )
            else:
                a_scale = tl.load(a_scale_ptr)
                b_scale = tl.load(b_scale_ptr + off_experts)

        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
            # Load the next block of A and B, generate a mask by checking the
            # K dimension.
            if EVEN_K:
                a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
                b = tl.load(b_ptrs)
            else:
                a = tl.load(
                    a_ptrs,
                    mask=token_mask[:, None]
                    & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                    other=0.0,
                )
                b = tl.load(
                    b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0
                )
            # We accumulate along the K dimension.
            if use_int8_w8a16:
                accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
            elif use_fp8_w8a8:
                if BLOCK_SCALE:
                    k_start = k * BLOCK_SIZE_K
                    offs_ks = k_start // group_k
                    a_scale = tl.load(
                        a_scale_ptrs + offs_ks * stride_ask,
                        mask=token_mask,
                        other=0.0,
                    )
                    b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
                    accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
                else:
                    accumulator = tl.dot(a, b, acc=accumulator)
            else:
                accumulator += tl.dot(a, b)
            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak
            b_ptrs += BLOCK_SIZE_K * stride_bk

        if MUL_ROUTED_WEIGHT:
            moe_weight = tl.load(
                topk_weights_ptr + offs_token, mask=token_mask, other=0
            )
            accumulator = accumulator * moe_weight[:, None]

        if use_int8_w8a16:
            accumulator = accumulator * b_scale
        elif use_fp8_w8a8:
            # With BLOCK_SCALE the scales were already applied inside the loop.
            if not BLOCK_SCALE:
                accumulator = accumulator * a_scale * b_scale

        if not MUL_ROUTED_WEIGHT:
            accumulator = _gelu_tanh(accumulator)

        accumulator = accumulator.to(compute_type)
        # -----------------------------------------------------------
        # Write back the block of the output
        offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
        c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
        tl.store(c_ptrs, accumulator, mask=c_mask)

        # advance tile_id
        tile_id += NUM_SMS


def fused_moe_gelu(
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    A_scale: Optional[torch.Tensor],
    B_scale: Optional[torch.Tensor],
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_tokens_post_padded: torch.Tensor,
    mul_routed_weight: bool,
    top_k: int,
    compute_type: tl.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    block_shape: Optional[List[int]] = None,
    config: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Launch the fused MoE GEMM (+ GELU) Triton kernel and write the result to C.

    Launches `_fused_moe_persistent_kernel` when the module flag set by
    `moe_set_use_persistent_kernel` is True, otherwise `_fused_moe_kernel`.
    Inside the kernels GELU is applied only when `mul_routed_weight` is False;
    when it is True the result is multiplied by the routing weights instead.

    Args:
        A: Activations; rows are indexed by `sorted_token_ids // top_k`.
        B: Stacked expert weights, indexed (expert, n, k).
        C: Output tensor, written in place (strides 1 and 2 are used).
        A_scale: Activation scales. Ignored and recomputed for per-tensor
            FP8 (`use_fp8_w8a8` with `block_shape is None`); required for
            block-scaled FP8; must be None when no quantization is used.
        B_scale: Weight scales; required for FP8 / INT8, None otherwise.
        topk_weights: Routing weights, contiguous in the last dimension.
        topk_ids: Routing ids; only `numel()` is used (number of valid slots).
        sorted_token_ids: Expert-sorted, padded token slots (contiguous).
        expert_ids: Expert index per BLOCK_SIZE_M block.
        num_tokens_post_padded: Single-element tensor read by the kernel.
        mul_routed_weight: Multiply the output by the routing weights.
        top_k: Number of experts per token.
        compute_type: Triton dtype of the stored output.
        use_fp8_w8a8: FP8 weights and activations.
        use_int8_w8a16: INT8 weights with per-channel scales.
        block_shape: `[block_n, block_k]` for block-scaled FP8, else None.
        config: Kernel meta-parameters forwarded as keyword arguments; must
            provide BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K and GROUP_SIZE_M.

    Raises:
        ValueError: if `config` is None.
        AssertionError: if the scale arguments do not match the selected
            quantization mode.
    """
    if config is None:
        # The default exists only for signature compatibility; the launch
        # cannot be configured without the block sizes.
        raise ValueError(
            "fused_moe_gelu requires `config` with BLOCK_SIZE_M, BLOCK_SIZE_N, "
            "BLOCK_SIZE_K and GROUP_SIZE_M"
        )

    assert topk_weights.stride(1) == 1
    assert sorted_token_ids.stride(0) == 1

    if use_fp8_w8a8:
        assert B_scale is not None
        if block_shape is None:
            # Dynamic per-tensor quantization of the activations.
            output = torch.zeros(A.shape, device=A.device, dtype=torch.float8_e4m3fnuz)
            A_scale = torch.zeros(1, device=A.device, dtype=torch.float32)
            A, A_scale = _MOE_A_QUANT_FUNC(output, A, A_scale)
        else:
            # TODO: Add support for per token group quantization
            # Until then the caller must pass pre-quantized A with its scales.
            assert len(block_shape) == 2
            assert (
                A_scale is not None
            ), "A_scale is required when block_shape is given"
            block_n, block_k = block_shape[0], block_shape[1]
            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
    elif use_int8_w8a16:
        assert B_scale is not None
        assert block_shape is None or block_shape[0] == 0
    else:
        assert A_scale is None
        assert B_scale is None

    EM = sorted_token_ids.shape[0]
    if A.shape[0] < config["BLOCK_SIZE_M"]:
        # optimize for small batch_size.
        # We assume that top_ids of each token is unique, so
        # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
        # and we can skip some invalid blocks.
        EM = min(sorted_token_ids.shape[0], A.shape[0] * top_k * config["BLOCK_SIZE_M"])

    # block_shape is [block_n, block_k] (see the asserts above); the kernels
    # index B scales with `n // group_n` and `k // group_k`.
    group_n = 0 if block_shape is None else block_shape[0]
    group_k = 0 if block_shape is None else block_shape[1]

    # Scale strides default to 0 when the scale tensor is absent or has too
    # few dimensions for that axis.
    a_scale_is_2d = A_scale is not None and A_scale.ndim == 2
    b_scale_ndim = 0 if B_scale is None else B_scale.ndim
    stride_args = (
        A.stride(0),
        A.stride(1),
        B.stride(0),
        B.stride(2),
        B.stride(1),
        C.stride(1),
        C.stride(2),
        A_scale.stride(0) if a_scale_is_2d else 0,
        A_scale.stride(1) if a_scale_is_2d else 0,
        B_scale.stride(0) if b_scale_ndim >= 2 else 0,
        B_scale.stride(2) if b_scale_ndim == 3 else 0,
        B_scale.stride(1) if b_scale_ndim >= 2 else 0,
    )
    tensor_args = (
        A,
        B,
        C,
        A_scale,
        B_scale,
        topk_weights,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
    )
    meta_args = dict(
        BLOCK_SCALE=group_k > 0 and group_n > 0,
        group_k=group_k,
        group_n=group_n,
        MUL_ROUTED_WEIGHT=mul_routed_weight,
        top_k=top_k,
        compute_type=compute_type,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        **config,
    )
    n_dim = B.shape[1]
    k_dim = A.shape[1] - _PADDING_SIZE
    num_valid_tokens = topk_ids.numel()

    if _USE_MOE_PERSISTENT_KERNEL:
        NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count * 2
        # Never launch more programs than there are tiles.
        grid = lambda META: (  # noqa: E731
            min(
                NUM_SMS,
                triton.cdiv(sorted_token_ids.shape[0], META["BLOCK_SIZE_M"])
                * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
            ),
        )

        _fused_moe_persistent_kernel[grid](
            *tensor_args,
            n_dim,
            k_dim,
            sorted_token_ids.shape[0],
            num_valid_tokens,
            *stride_args,
            NUM_SMS=NUM_SMS,
            **meta_args,
        )
    else:
        grid = lambda META: (  # noqa: E731
            triton.cdiv(EM, META["BLOCK_SIZE_M"])
            * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
        )
        _fused_moe_kernel[grid](
            *tensor_args,
            n_dim,
            k_dim,
            EM,
            num_valid_tokens,
            *stride_args,
            **meta_args,
        )


# ---------------------------------------------------------------------------
# Patch boundary preserved from the original dump:
# diff --git a/aiter/ops/triton/moe_op_mxfp4.py b/aiter/ops/triton/moe_op_mxfp4.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..7a3918bda1a0330297cac1aaa52ff70e4c650fc2
# --- /dev/null
# +++ b/aiter/ops/triton/moe_op_mxfp4.py
# @@ -0,0 +1,450 @@
# ---------------------------------------------------------------------------
# SPDX-License-Identifier: MIT

import torch
import triton
import triton.language as tl
from typing import Any, Dict
from aiter.ops.triton.utils.pid_preprocessing import pid_grid, remap_xcd
from aiter.ops.triton.utils.moe_common import _write_zeros_to_output


@tl.constexpr_function
def get_scaled_dot_format_string(dtype: tl.dtype):
    """
    Return the operand format string for `tl.dot_scaled` for a Triton dtype.

    `tl.uint8` maps to "e2m1" because fp4 values are stored packed in bytes.
    Raises KeyError for any dtype that is not listed.
    """
    mapping = {
        tl.float16: "fp16",
        tl.bfloat16: "bf16",
        tl.uint8: "e2m1",
        tl.float8e4nv: "e4m3",
        tl.float8e5: "e5m2",
    }
    return mapping[dtype]
@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
    }
)
@triton.jit
def _fused_moe_kernel_mxfp4(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    a_mx_scale_ptr,
    b_mx_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    EM,
    num_valid_tokens,
    # Strides
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_amxm,
    stride_amxk,
    stride_bmxe,
    stride_bmxk,
    stride_bmxn,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    SWIZZLE_MX_A: tl.constexpr,  # TODO add swizzle support
    SWIZZLE_MX_B: tl.constexpr,  # TODO add swizzle support
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices, with microscaled (MX) operands fed to
    `tl.dot_scaled`.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - a_scale / b_scale: scalar A scale and per-expert B scale multiplied
        into the accumulator after the K loop.
    - a_mx_scale / b_mx_scale: uint8 microscales, one per MX_PACK_DIVISOR
        (32) elements along K. An operand is treated as microscaled when
        its mx-scale pointer is not None.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A. A value of -1 makes the program write zeros.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.

    NOTE(review): the SWIZZLE_MX_A / SWIZZLE_MX_B branches are marked TODO.
    The K loop reads `offs_scale_ak` / `offs_scale_bk`, which are only
    defined on the non-swizzled branches, and the swizzled A offsets are
    reduced modulo N - confirm before enabling swizzling.
    """
    is_a_microscaled_format: tl.constexpr = a_mx_scale_ptr is not None
    is_b_microscaled_format: tl.constexpr = b_mx_scale_ptr is not None
    MX_PACK_DIVISOR: tl.constexpr = 32
    if is_a_microscaled_format:
        a_type: tl.constexpr = a_ptr.dtype.element_ty
        tl.static_assert(
            a_type == tl.uint8 or (a_type == tl.float8e4nv or a_type == tl.float8e5),
            "mx_weight_ptr must be 1 byte",
        )
        tl.static_assert(
            a_mx_scale_ptr.dtype.element_ty == tl.uint8, "a_mx_scale_ptr must be uint8"
        )
        tl.static_assert(
            BLOCK_SIZE_K % MX_PACK_DIVISOR == 0,
            "BLOCK_SIZE_K must be a multiple of MX_PACK_DIVISOR",
        )
    if is_b_microscaled_format:
        b_type: tl.constexpr = b_ptr.dtype.element_ty
        tl.static_assert(
            b_type == tl.uint8 or (b_type == tl.float8e4nv or b_type == tl.float8e5),
            "mx_weight_ptr must be 1 byte",
        )
        tl.static_assert(
            b_mx_scale_ptr.dtype.element_ty == tl.uint8, "b_mx_scale_ptr must be uint8"
        )
        tl.static_assert(
            BLOCK_SIZE_K % MX_PACK_DIVISOR == 0,
            "BLOCK_SIZE_K must be a multiple of MX_PACK_DIVISOR",
        )

    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    NUM_XCDS: tl.constexpr = 8

    GRID_MN = num_pid_n * num_pid_m
    if pid < GRID_MN:
        pid = remap_xcd(pid, GRID_MN, NUM_XCDS)
    else:
        return  # rest of the tiles are dummy paddings
    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction
    # and accumulate
    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    # Entries >= num_valid_tokens are padding and must not be read or written.
    token_mask = offs_token < num_valid_tokens

    off_expert = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
    if off_expert == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        _write_zeros_to_output(
            c_ptr,
            stride_cm,
            stride_cn,
            pid_n,
            N,
            offs_token,
            token_mask,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            compute_type,
        )
        return

    # Load a_scale, b_scale
    a_scale = tl.load(a_scale_ptr)
    b_scale = tl.load(b_scale_ptr + off_expert)
    # Set offsets of B on dim N
    offs_b_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
    offs_b_n = tl.max_contiguous(
        tl.multiple_of(offs_b_n % N, BLOCK_SIZE_N), BLOCK_SIZE_N
    )
    # Load a_mx_scale
    if is_a_microscaled_format:
        # We have pack 2 fp4 values in a byte
        A_PACK_DIVISOR: tl.constexpr = 2 if a_ptr.dtype.element_ty == tl.uint8 else 1
        # e.g. 64 packed bytes / 4 scales per step when BLOCK_SIZE_K == 128
        PACKED_BLOCK_K_A: tl.constexpr = BLOCK_SIZE_K // A_PACK_DIVISOR
        MX_SCALE_BLOCK_K_A: tl.constexpr = BLOCK_SIZE_K // MX_PACK_DIVISOR

        if SWIZZLE_MX_A:
            tl.static_assert(BLOCK_SIZE_M % 128 == 0)
            tl.static_assert(MX_SCALE_BLOCK_K_A % 4 == 0)
            PACKED_MX_BLOCK_A: tl.constexpr = (MX_SCALE_BLOCK_K_A // 4) * 32 * 4 * 4
            offs_inner = tl.arange(0, PACKED_MX_BLOCK_A)
            offs_scale_m = (
                pid_m * (BLOCK_SIZE_M // 128) + tl.arange(0, BLOCK_SIZE_M // 128)
            ) % N
            offs_scale_m = tl.max_contiguous(
                tl.multiple_of(offs_scale_m, BLOCK_SIZE_M // 128), BLOCK_SIZE_M // 128
            )

            a_mx_scale_ptrs = (
                a_mx_scale_ptr
                + offs_scale_m.to(tl.int64)[:, None] * stride_amxm
                + offs_inner[None, :]
            )
        else:
            offs_scale_ak = tl.arange(0, MX_SCALE_BLOCK_K_A)
            offs_scale_m = offs_token
            # K dimension must be the last dimension for the scales
            a_mx_scale_ptrs = (
                a_mx_scale_ptr
                + offs_scale_ak.to(tl.int64)[None, :] * stride_amxk
                + offs_scale_m.to(tl.int64)[:, None] // top_k * stride_amxm
            )
    else:
        a_mx_scale_ptrs = None
        A_PACK_DIVISOR: tl.constexpr = 1
        MX_SCALE_BLOCK_K_A: tl.constexpr = 1
        PACKED_BLOCK_K_A: tl.constexpr = BLOCK_SIZE_K
    # Load b_mx_scale
    if is_b_microscaled_format:
        # We have pack 2 fp4 values in a byte
        B_PACK_DIVISOR: tl.constexpr = 2 if b_ptr.dtype.element_ty == tl.uint8 else 1
        # e.g. 64 packed bytes / 4 scales per step when BLOCK_SIZE_K == 128
        PACKED_BLOCK_K_B: tl.constexpr = BLOCK_SIZE_K // B_PACK_DIVISOR
        MX_SCALE_BLOCK_K_B: tl.constexpr = BLOCK_SIZE_K // MX_PACK_DIVISOR

        b_mx_scale_ptr += off_expert * stride_bmxe

        if SWIZZLE_MX_B:
            tl.static_assert(BLOCK_SIZE_N % 128 == 0)
            tl.static_assert(MX_SCALE_BLOCK_K_B % 4 == 0)
            PACKED_MX_BLOCK_B: tl.constexpr = (MX_SCALE_BLOCK_K_B // 4) * 32 * 4 * 4
            offs_inner = tl.arange(0, PACKED_MX_BLOCK_B)
            offs_scale_n = (
                pid_n * (BLOCK_SIZE_N // 128) + tl.arange(0, BLOCK_SIZE_N // 128)
            ) % N
            offs_scale_n = tl.max_contiguous(
                tl.multiple_of(offs_scale_n, BLOCK_SIZE_N // 128), BLOCK_SIZE_N // 128
            )

            # The row pitch is computed from K instead of using stride_bmxn.
            b_mx_scale_ptrs = (
                b_mx_scale_ptr
                + offs_scale_n.to(tl.int64)[:, None]
                * PACKED_MX_BLOCK_B
                * (K // MX_SCALE_BLOCK_K_B // (MX_PACK_DIVISOR // B_PACK_DIVISOR))
                + offs_inner[None, :]
            )
        else:
            offs_scale_bk = tl.arange(0, MX_SCALE_BLOCK_K_B)
            offs_scale_n = offs_b_n
            # K dimension must be the last dimension for the scales
            b_mx_scale_ptrs = (
                b_mx_scale_ptr
                + offs_scale_bk.to(tl.int64)[None, :] * stride_bmxk
                + offs_scale_n.to(tl.int64)[:, None] * stride_bmxn
            )
    else:
        b_mx_scale_ptrs = None
        B_PACK_DIVISOR: tl.constexpr = 1
        MX_SCALE_BLOCK_K_B: tl.constexpr = 1
        PACKED_BLOCK_K_B: tl.constexpr = BLOCK_SIZE_K

    offs_a_k = tl.arange(0, PACKED_BLOCK_K_A)
    offs_b_k = tl.arange(0, PACKED_BLOCK_K_B)
    a_ptrs = a_ptr + (
        offs_token[:, None] // top_k * stride_am + offs_a_k[None, :] * stride_ak
    )
    b_ptrs = (
        b_ptr
        + off_expert * stride_be
        + (offs_b_k[:, None] * stride_bk + offs_b_n[None, :] * stride_bn)
    )

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted to `compute_type` after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, PACKED_BLOCK_K_A)):
        # Load the next block of A and B, generate a mask by checking the
        # K dimension.
        if EVEN_K:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None],
                other=0.0,
            )
            b = tl.load(b_ptrs)
        else:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None]
                & (offs_a_k[None, :] < (K - k * PACKED_BLOCK_K_A)),
                other=0.0,
            )
            b = tl.load(
                b_ptrs,
                mask=offs_b_k[:, None] < (K - k * PACKED_BLOCK_K_B),
                other=0.0,
            )
        # We accumulate along the K dimension.
        if is_a_microscaled_format or is_b_microscaled_format:
            a_format: tl.constexpr = get_scaled_dot_format_string(a.dtype)
            b_format: tl.constexpr = get_scaled_dot_format_string(b.dtype)
            if is_a_microscaled_format:
                mask_ak_scale = offs_scale_ak < (K - k * PACKED_BLOCK_K_A) // (
                    MX_PACK_DIVISOR // A_PACK_DIVISOR
                )
                # Mask padding rows as well: their token index is not a
                # valid row, exactly as for the load of `a` above.
                a_mx_scales = tl.load(
                    a_mx_scale_ptrs,
                    mask=token_mask[:, None] & mask_ak_scale[None, :],
                    other=0.0,
                )
            else:
                a_mx_scales = None
            mask_bk_scale = offs_scale_bk < (K - k * PACKED_BLOCK_K_B) // (
                MX_PACK_DIVISOR // B_PACK_DIVISOR
            )
            b_mx_scales = tl.load(
                b_mx_scale_ptrs, mask=mask_bk_scale[None, :], other=0.0
            )

            accumulator = tl.dot_scaled(
                a,
                a_mx_scales,
                a_format,
                b,
                b_mx_scales,
                b_format,
                acc=accumulator,
                fast_math=True,
            )

            # Advance the microscale pointers to the next K block.
            if is_a_microscaled_format:
                if SWIZZLE_MX_A:
                    a_mx_scale_ptrs += MX_SCALE_BLOCK_K_A // 4 * stride_amxk
                else:
                    a_mx_scale_ptrs += MX_SCALE_BLOCK_K_A * stride_amxk
            if SWIZZLE_MX_B:
                b_mx_scale_ptrs += MX_SCALE_BLOCK_K_B // 4 * 512
            else:
                b_mx_scale_ptrs += MX_SCALE_BLOCK_K_B * stride_bmxk
        # Advance the ptrs to the next K block.
        a_ptrs += PACKED_BLOCK_K_A * stride_ak
        b_ptrs += PACKED_BLOCK_K_B * stride_bk

    # Multiply with the scalar weight
    accumulator *= a_scale * b_scale
    if MUL_ROUTED_WEIGHT:
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator = accumulator * moe_weight[:, None]
    accumulator = accumulator.to(compute_type)
    # -----------------------------------------------------------
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)


def fused_moe_mxfp4(
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    A_scale: torch.Tensor,
    B_scale: torch.Tensor,
    A_mx_scale: torch.Tensor,
    B_mx_scale: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_tokens_post_padded: torch.Tensor,
    mul_routed_weight: bool,
    top_k: int,
    swizzle_mx_a: bool,
    swizzle_mx_b: bool,
    config: Dict[str, Any],
    compute_type: tl.dtype,
) -> None:
    """
    Launch `_fused_moe_kernel_mxfp4` and write the result into C.

    Args:
        A: Activations; `torch.uint8` means packed mxfp4 and requires
            `A_mx_scale`, any other dtype requires `A_mx_scale is None`.
        B: Stacked expert weights, indexed (expert, n, k).
        C: Output tensor, written in place (strides 1 and 2 are used).
        A_scale: Scalar activation scale (required).
        B_scale: Per-expert weight scale (required).
        A_mx_scale: 2-D microscales for A, or None (see `A`).
        B_mx_scale: 3-D microscales for B (required).
        topk_weights: Routing weights, contiguous in the last dimension.
        topk_ids: Routing ids; only `numel()` is used (number of valid slots).
        sorted_token_ids: Expert-sorted, padded token slots (contiguous).
        expert_ids: Expert index per BLOCK_SIZE_M block.
        num_tokens_post_padded: Single-element tensor read by the kernel.
        mul_routed_weight: Multiply the output by the routing weights.
        top_k: Number of experts per token.
        swizzle_mx_a: Forwarded as SWIZZLE_MX_A (kernel marks it TODO).
        swizzle_mx_b: Forwarded as SWIZZLE_MX_B (kernel marks it TODO).
        config: Kernel meta-parameters forwarded as keyword arguments; must
            provide BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K and GROUP_SIZE_M.
        compute_type: Triton dtype of the stored output.

    Raises:
        AssertionError: if the scale arguments are inconsistent with A's dtype
            or a required scale is missing.
    """
    assert topk_weights.stride(1) == 1
    assert sorted_token_ids.stride(0) == 1

    assert A_scale is not None
    assert B_scale is not None
    if A.dtype == torch.uint8:
        assert A_mx_scale is not None, "A_mx_scale should exist when A is mxfp4"
        a_mx_stride_m, a_mx_stride_k = A_mx_scale.stride()
    else:
        assert A_mx_scale is None, "A_mx_scale should not exist when A is not mxfp4"
        # The kernel only reads these strides when A is microscaled.
        a_mx_stride_m, a_mx_stride_k = None, None
    # NOTE: Only supports B_mx_scale
    assert B_mx_scale is not None

    EM = sorted_token_ids.shape[0]
    if A.shape[0] < config["BLOCK_SIZE_M"]:
        # optimize for small batch_size.
        # We assume that top_ids of each token is unique, so
        # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
        # and we can skip some invalid blocks.
        EM = min(sorted_token_ids.shape[0], A.shape[0] * top_k * config["BLOCK_SIZE_M"])

    grid = lambda META: (  # noqa: E731
        triton.cdiv(EM, META["BLOCK_SIZE_M"])
        * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
    )
    _fused_moe_kernel_mxfp4[grid](
        A,
        B,
        C,
        A_scale,
        B_scale,
        A_mx_scale,
        B_mx_scale,
        topk_weights,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        B.shape[1],
        A.shape[1],
        EM,
        topk_ids.numel(),
        A.stride(0),
        A.stride(1),
        B.stride(0),
        B.stride(2),
        B.stride(1),
        C.stride(1),
        C.stride(2),
        a_mx_stride_m,
        a_mx_stride_k,
        B_mx_scale.stride(0),
        B_mx_scale.stride(2),
        B_mx_scale.stride(1),
        MUL_ROUTED_WEIGHT=mul_routed_weight,
        top_k=top_k,
        compute_type=compute_type,
        SWIZZLE_MX_A=swizzle_mx_a,  # TODO add swizzle support
        SWIZZLE_MX_B=swizzle_mx_b,  # TODO add swizzle support
        **config,
    )


# ---------------------------------------------------------------------------
# Patch boundary preserved from the original dump:
# diff --git a/aiter/ops/triton/moe_op_silu_fused.py b/aiter/ops/triton/moe_op_silu_fused.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..2889557d37abdcb2b733b546a620d77769dea318
# --- /dev/null
# +++ b/aiter/ops/triton/moe_op_silu_fused.py
# @@ -0,0 +1,1244 @@
# ---------------------------------------------------------------------------
# SPDX-License-Identifier: MIT

import torch
import triton
import triton.language as tl
from typing import Any, Dict, Optional, List

from aiter.ops.triton.activation import _silu_exp2
from aiter.ops.triton.quant import dynamic_per_tensor_quant_fp8_i8
from aiter.ops.triton.utils.pid_preprocessing import pid_grid, remap_xcd
from aiter.ops.triton.utils.moe_common import _write_zeros_to_output

# Source:
# MoE Kernel adapted from VLLM

_PADDING_SIZE = 0

_MOE_A_QUANT_FUNC = dynamic_per_tensor_quant_fp8_i8

_USE_MOE_PERSISTENT_KERNEL = False


def moe_set_use_persistent_kernel(value: bool):
    """
    Set the module-level `_USE_MOE_PERSISTENT_KERNEL` flag.
    """
    global _USE_MOE_PERSISTENT_KERNEL
    _USE_MOE_PERSISTENT_KERNEL = value
def moe_set_padding_size(size: int):
    """
    Override the module-level padding size (`_PADDING_SIZE`).
    """
    global _PADDING_SIZE
    _PADDING_SIZE = size


def moe_set_quant_func(func):
    """
    Override 'A' matrix ie activations quantization function.
    Default function does dynamic quantization.
    """
    global _MOE_A_QUANT_FUNC
    _MOE_A_QUANT_FUNC = func


@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
    }
)
@triton.jit
def _fused_moe_silu_kernel_gptq_awq(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    b_scale_ptr,
    b_zp_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N: tl.constexpr,
    K: tl.constexpr,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bse,
    stride_bsk,
    stride_bsn,
    stride_bze,
    stride_bzk,
    stride_bzn,
    block_k_diviable: tl.constexpr,
    group_size: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    has_zp: tl.constexpr,
    use_int4_w4a16: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices, followed by a fused SiLU-and-multiply, for
    weight-only quantized (int4 / int8 with group-wise scales) expert weights.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output tensor. Only N // 2 columns are written per row: each
        output column j is silu(acc[:, j]) * acc[:, j + N // 2], where acc is
        the (dequantized) A @ B product for that token.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A. A value of -1 makes the program write zeros instead.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.
    """
    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    NUM_XCDS: tl.constexpr = 8

    GRID_MN = num_pid_n * num_pid_m
    if pid < GRID_MN:
        pid = remap_xcd(pid, GRID_MN, NUM_XCDS)
    else:
        return  # rest of the tiles are dummy paddings
    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # Each program consumes BLOCK_SIZE_N columns of B but produces only half
    # as many output columns (one per silu/mul pair).
    BLOCK_SIZE_HALF: tl.constexpr = BLOCK_SIZE_N // 2

    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction
    # and accumulate
    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    token_mask = offs_token < num_valid_tokens

    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
    if off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        # The zero tile must match the tile stored at the end of this kernel:
        # BLOCK_SIZE_HALF columns starting at pid_n * BLOCK_SIZE_HALF, bounded
        # by N // 2. Passing the full N / BLOCK_SIZE_N would address columns
        # this kernel never owns.
        # NOTE(review): assumes the helper uses its N / block-size arguments
        # as the column bound and tile width — confirm against moe_common.
        _write_zeros_to_output(
            c_ptr,
            stride_cm,
            stride_cn,
            pid_n,
            N // 2,
            offs_token,
            token_mask,
            BLOCK_SIZE_M,
            BLOCK_SIZE_HALF,
            compute_type,
        )
        return

    # silu ptrs
    i = tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
    # [0, 0, 1, 1, ..., BLOCK_SIZE_HALF - 1, BLOCK_SIZE_HALF - 1]
    i_floor = i // 2
    offs_half = (pid_n * (BLOCK_SIZE_N // 2) + i_floor) % (N // 2)
    # (i % 2): [0, 1, 0, 1,...] (alternating)
    # (i % 2) * (N // 2) : [0, (N // 2), 0, (N // 2),...]
    # So offs_bn now takes element from the first BLOCK_SIZE_HALF half and the
    # second BLOCK_SIZE_HALF half in an alternating way (This allows us to do
    # reshape without permute)
    offs_bn = (offs_half + (i % 2) * (N // 2)) % N

    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # Each sorted slot maps back to its source row of A via `// top_k`.
    a_ptrs = a_ptr + (
        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
    )

    if use_int4_w4a16:
        # Two 4-bit weights share one storage element along K: index by k // 2
        # and select the nibble with a shift of 0 or 4 bits.
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + (offs_k[:, None] // 2) * stride_bk
            + offs_bn[None, :] * stride_bn
        )
        b_shifter = (offs_k[:, None] % 2) * 4
    elif use_int8_w8a16:
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + offs_k[:, None] * stride_bk
            + offs_bn[None, :] * stride_bn
        )

    # Without explicit zero points a fixed mid-range offset is subtracted
    # (8 for 4-bit, 128 for 8-bit weights).
    if not has_zp and use_int4_w4a16:
        b_zp_num = 8
    if not has_zp and use_int8_w8a16:
        b_zp_num = 128
    elif has_zp and use_int4_w4a16:
        # Zero points are packed two per element along N.
        b_zp_shifter = (offs_bn[None, :] % 2) * 4

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted to `compute_type` after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Load the next block of A and B, generate a mask by checking the
        # K dimension.

        if not block_k_diviable:
            k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K
            k_other = 0.0
        else:
            k_mask = None
            k_other = None

        if EVEN_K:
            a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
            b = tl.load(b_ptrs)
        else:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                other=0.0,
            )
            b = tl.load(b_ptrs)

        if use_int4_w4a16:
            b = (b >> b_shifter) & 0xF

        # One scale per `group_size` consecutive K elements and per column.
        b_scale_ptrs = (
            b_scale_ptr
            + off_experts * stride_bse
            + offs_bn[None, :] * stride_bsn
            + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk
        )
        b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
        b_scale = b_scale.to(tl.float32)

        if has_zp and use_int4_w4a16:
            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
            b_zp_ptrs = (
                b_zp_ptr
                + off_experts * stride_bze
                + (offs_bn[None, :] // 2) * stride_bzn
                + offs_k_true * stride_bzk
            )
            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
            b_zp = (b_zp >> b_zp_shifter) & 0xF
            b_zp = b_zp.to(tl.float32)
        elif has_zp and use_int8_w8a16:
            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
            b_zp_ptrs = (
                b_zp_ptr
                + off_experts * stride_bze
                + offs_bn[None, :] * stride_bzn
                + offs_k_true * stride_bzk
            )
            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
            b_zp = b_zp.to(tl.float32)

        # Dequantize B, then accumulate along the K dimension.
        if has_zp:
            b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type)
        else:
            b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type)
        accumulator = tl.dot(a, b, acc=accumulator)

        # Advance the ptrs to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        if use_int4_w4a16:
            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
        else:
            b_ptrs += BLOCK_SIZE_K * stride_bk

    if MUL_ROUTED_WEIGHT:
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator = accumulator * moe_weight[:, None]

    accumulator = accumulator.to(compute_type)

    # silu_and_mul: columns were loaded interleaved (first half, second half,
    # first half, ...), so a reshape to (..., 2) + split separates the two
    # halves without a permute.
    silu_acc, mul_acc = (
        accumulator.to(tl.float32).reshape(BLOCK_SIZE_M, BLOCK_SIZE_HALF, 2).split()
    )
    # presumably the exp2-based SiLU from aiter.ops.triton.activation — verify
    silu_acc = _silu_exp2(silu_acc)
    accumulator = (silu_acc * mul_acc).to(compute_type)

    # -----------------------------------------------------------
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_HALF + tl.arange(0, BLOCK_SIZE_HALF)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N // 2)
    tl.store(c_ptrs, accumulator, mask=c_mask)
@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
    }
)
@triton.jit
def _fused_moe_persistent_silu_kernel_gptq_awq(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    b_scale_ptr,
    b_zp_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N: tl.constexpr,
    K: tl.constexpr,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bse,
    stride_bsk,
    stride_bsn,
    stride_bze,
    stride_bzk,
    stride_bzn,
    block_k_diviable: tl.constexpr,
    group_size: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    NUM_SMS: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    has_zp: tl.constexpr,
    use_int4_w4a16: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices, followed by a fused SiLU-and-multiply, for
    weight-only quantized (int4 / int8 with group-wise scales) expert weights.
    This is the persistent variant: each program starts at tile
    `program_id` and advances by NUM_SMS tiles per loop iteration.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output tensor. Only N // 2 columns are written per row: each
        output column j is silu(acc[:, j]) * acc[:, j + N // 2], where acc is
        the (dequantized) A @ B product for that token.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.

    NOTE(review): there is no special handling of a negative expert id in
    this kernel; the loaded id is used directly to offset `b_ptr`. Confirm
    that callers never pass -1 entries in `expert_ids` on this path.
    """
    # -----------------------------------------------------------
    # Work out which tiles of C this persistent program should compute.
    start_pid = tl.program_id(axis=0)
    NUM_XCDS: tl.constexpr = 8
    # Load tile-invariant runtime constant
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    tile_id = start_pid

    offs_k = tl.arange(0, BLOCK_SIZE_K)

    num_tiles = num_pid_m * num_pid_n
    # Number of tiles this program handles: tile_id, tile_id + NUM_SMS, ...
    # while staying below num_tiles.
    num_valid_tiles = tl.cdiv((num_tiles - tile_id), NUM_SMS)
    for _ in range(0, num_valid_tiles):
        tile_id_remapped = remap_xcd(tile_id, num_tiles, NUM_XCDS)
        pid_m, pid_n = pid_grid(tile_id_remapped, num_pid_m, num_pid_n, GROUP_SIZE_M)

        # Compute the mask
        offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
        offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
        token_mask = offs_token < num_valid_tokens
        off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)

        # silu ptrs
        BLOCK_SIZE_HALF: tl.constexpr = BLOCK_SIZE_N // 2
        i = tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
        # [0, 0, 1, 1, ..., BLOCK_SIZE_HALF - 1, BLOCK_SIZE_HALF - 1]
        i_floor = i // 2
        offs_half = (pid_n * (BLOCK_SIZE_N // 2) + i_floor) % (N // 2)
        # (i % 2): [0, 1, 0, 1,...] (alternating)
        # (i % 2) * (N // 2) : [0, (N // 2), 0, (N // 2),...]
        # So offs_bn now takes element from the first BLOCK_SIZE_HALF half and
        # the second BLOCK_SIZE_HALF half in an alternating way (This allows us
        # to do reshape without permute)
        offs_bn = (offs_half + (i % 2) * (N // 2)) % N

        # Compute the A pointer; each sorted slot maps back to its source row
        # of A via `// top_k`.
        a_ptrs = a_ptr + (
            offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
        )

        if use_int4_w4a16:
            # Two 4-bit weights share one storage element along K: index by
            # k // 2 and select the nibble with a shift of 0 or 4 bits.
            b_ptrs = (
                b_ptr
                + off_experts * stride_be
                + (offs_k[:, None] // 2) * stride_bk
                + offs_bn[None, :] * stride_bn
            )
            b_shifter = (offs_k[:, None] % 2) * 4
        elif use_int8_w8a16:
            b_ptrs = (
                b_ptr
                + off_experts * stride_be
                + offs_k[:, None] * stride_bk
                + offs_bn[None, :] * stride_bn
            )

        # Without explicit zero points a fixed mid-range offset is subtracted
        # (8 for 4-bit, 128 for 8-bit weights).
        if not has_zp and use_int4_w4a16:
            b_zp_num = 8
        if not has_zp and use_int8_w8a16:
            b_zp_num = 128
        elif has_zp and use_int4_w4a16:
            # Zero points are packed two per element along N.
            b_zp_shifter = (offs_bn[None, :] % 2) * 4

        # -----------------------------------------------------------
        # Iterate to compute a block of the C matrix.
        # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
        # of fp32 values for higher accuracy.
        # `accumulator` will be converted to `compute_type` after the loop.
        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
            # Load the next block of A and B, generate a mask by checking the
            # K dimension.

            if not block_k_diviable:
                k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K
                k_other = 0.0
            else:
                k_mask = None
                k_other = None

            if EVEN_K:
                a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
                b = tl.load(b_ptrs)
            else:
                a = tl.load(
                    a_ptrs,
                    mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                    other=0.0,
                )
                b = tl.load(b_ptrs)

            if use_int4_w4a16:
                b = (b >> b_shifter) & 0xF

            # One scale per `group_size` consecutive K elements and per column.
            b_scale_ptrs = (
                b_scale_ptr
                + off_experts * stride_bse
                + offs_bn[None, :] * stride_bsn
                + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk
            )
            b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
            b_scale = b_scale.to(tl.float32)

            if has_zp and use_int4_w4a16:
                offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
                b_zp_ptrs = (
                    b_zp_ptr
                    + off_experts * stride_bze
                    + (offs_bn[None, :] // 2) * stride_bzn
                    + offs_k_true * stride_bzk
                )
                b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
                b_zp = (b_zp >> b_zp_shifter) & 0xF
                b_zp = b_zp.to(tl.float32)
            elif has_zp and use_int8_w8a16:
                offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
                b_zp_ptrs = (
                    b_zp_ptr
                    + off_experts * stride_bze
                    + offs_bn[None, :] * stride_bzn
                    + offs_k_true * stride_bzk
                )
                b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
                b_zp = b_zp.to(tl.float32)

            # Dequantize B, then accumulate along the K dimension.
            if has_zp:
                b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type)
            else:
                b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type)
            accumulator = tl.dot(a, b, acc=accumulator)

            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak
            if use_int4_w4a16:
                b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
            else:
                b_ptrs += BLOCK_SIZE_K * stride_bk

        if MUL_ROUTED_WEIGHT:
            moe_weight = tl.load(
                topk_weights_ptr + offs_token, mask=token_mask, other=0
            )
            accumulator = accumulator * moe_weight[:, None]

        accumulator = accumulator.to(compute_type)

        # silu_and_mul: the interleaved column order lets reshape(..., 2) +
        # split separate the first-half and second-half columns.
        silu_acc, mul_acc = (
            accumulator.to(tl.float32).reshape(BLOCK_SIZE_M, BLOCK_SIZE_HALF, 2).split()
        )
        # x / (1 + 2^(-x * log2(e))) == x * sigmoid(x); 1.44269504089 ~= log2(e)
        silu_acc = silu_acc / (1.0 + tl.exp2(-(silu_acc * 1.44269504089)))
        accumulator = (silu_acc * mul_acc).to(compute_type)

        # -----------------------------------------------------------
        # Write back the block of the output (BLOCK_SIZE_HALF columns wide)
        offs_cn = pid_n * BLOCK_SIZE_HALF + tl.arange(0, BLOCK_SIZE_HALF)
        c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N // 2)
        tl.store(c_ptrs, accumulator, mask=c_mask)

        # Advance to this program's next tile.
        tile_id += NUM_SMS
@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
    }
)
@triton.jit
def _fused_moe_silu_kernel(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsk,
    stride_bsn,
    # Block size for block-wise quantization
    group_n: tl.constexpr,
    group_k: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    use_fp8_w8a8: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices, followed by a fused SiLU-and-multiply.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output tensor. Only N // 2 columns are written per row: each
        output column j is silu(acc[:, j]) * acc[:, j + N // 2], where acc is
        the scaled A @ B product for that token.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A. A value of -1 makes the program write zeros instead.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.
    """
    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    NUM_XCDS: tl.constexpr = 8

    GRID_MN = num_pid_n * num_pid_m
    if pid < GRID_MN:
        pid = remap_xcd(pid, GRID_MN, NUM_XCDS)
    else:
        return  # rest of the tiles are dummy paddings
    pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # Each program consumes BLOCK_SIZE_N columns of B but produces only half
    # as many output columns (one per silu/mul pair).
    BLOCK_SIZE_HALF: tl.constexpr = BLOCK_SIZE_N // 2

    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction
    # and accumulate
    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    token_mask = offs_token < num_valid_tokens

    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
    if off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        # The zero tile must match the tile stored at the end of this kernel:
        # BLOCK_SIZE_HALF columns starting at pid_n * BLOCK_SIZE_HALF, bounded
        # by N // 2. Passing the full N / BLOCK_SIZE_N would address columns
        # this kernel never owns.
        # NOTE(review): assumes the helper uses its N / block-size arguments
        # as the column bound and tile width — confirm against moe_common.
        _write_zeros_to_output(
            c_ptr,
            stride_cm,
            stride_cn,
            pid_n,
            N // 2,
            offs_token,
            token_mask,
            BLOCK_SIZE_M,
            BLOCK_SIZE_HALF,
            compute_type,
        )
        return

    # silu ptrs
    i = tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
    # [0, 0, 1, 1, ..., BLOCK_SIZE_HALF - 1, BLOCK_SIZE_HALF - 1]
    i_floor = i // 2
    offs_half = (pid_n * (BLOCK_SIZE_N // 2) + i_floor) % (N // 2)
    # (i % 2): [0, 1, 0, 1,...] (alternating)
    # (i % 2) * (N // 2) : [0, (N // 2), 0, (N // 2),...]
    # So offs_bn now takes element from the first BLOCK_SIZE_HALF half and the
    # second BLOCK_SIZE_HALF half in an alternating way (This allows us to do
    # reshape without permute)
    offs_bn = (offs_half + (i % 2) * (N // 2)) % N

    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # Each sorted slot maps back to its source row of A via `// top_k`.
    a_ptrs = a_ptr + (
        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
    )

    b_ptrs = (
        b_ptr
        + off_experts * stride_be
        + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
    )
    if use_int8_w8a16:
        # One weight scale per output column, applied once after the K loop.
        b_scale_ptrs = (
            b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
        )
        b_scale = tl.load(b_scale_ptrs)

    if use_fp8_w8a8:
        if group_k > 0 and group_n > 0:
            # Block-wise scales: loaded per K step inside the loop.
            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
            offs_bsn = offs_bn // group_n
            b_scale_ptrs = (
                b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
            )
        else:
            # Single activation scale and one weight scale per expert.
            a_scale = tl.load(a_scale_ptr)
            b_scale = tl.load(b_scale_ptr + off_experts)

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted to `compute_type` after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Load the next block of A and B, generate a mask by checking the
        # K dimension.
        if EVEN_K:
            a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
            b = tl.load(b_ptrs)
        else:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                other=0.0,
            )
            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        # We accumulate along the K dimension.
        if use_int8_w8a16:
            accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
        elif use_fp8_w8a8:
            if group_k > 0 and group_n > 0:
                k_start = k * BLOCK_SIZE_K
                offs_ks = k_start // group_k
                a_scale = tl.load(
                    a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0
                )
                b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)

                accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
            else:
                accumulator = tl.dot(a, b, acc=accumulator)
        else:
            accumulator += tl.dot(a, b)
        # Advance the ptrs to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    if MUL_ROUTED_WEIGHT:
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator = accumulator * moe_weight[:, None]
    if use_int8_w8a16:
        accumulator = (accumulator * b_scale).to(compute_type)
    elif use_fp8_w8a8:
        if group_k > 0 and group_n > 0:
            # Block-wise scales were already applied inside the K loop.
            accumulator = accumulator.to(compute_type)
        else:
            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
    else:
        accumulator = accumulator.to(compute_type)

    # silu_and_mul: the interleaved column order lets reshape(..., 2) + split
    # separate the first-half and second-half columns.
    silu_acc, mul_acc = (
        accumulator.to(tl.float32).reshape(BLOCK_SIZE_M, BLOCK_SIZE_HALF, 2).split()
    )
    # x / (1 + 2^(-x * log2(e))) == x * sigmoid(x); 1.44269504089 ~= log2(e)
    silu_acc = silu_acc / (1.0 + tl.exp2(-(silu_acc * 1.44269504089)))
    accumulator = (silu_acc * mul_acc).to(compute_type)

    # -----------------------------------------------------------
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_HALF + tl.arange(0, BLOCK_SIZE_HALF)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N // 2)
    tl.store(c_ptrs, accumulator, mask=c_mask)
@triton.heuristics(
    {
        "EVEN_K": lambda args: args["K"] % args["BLOCK_SIZE_K"] == 0,
    }
)
@triton.jit
def _fused_moe_persistent_silu_kernel(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsk,
    stride_bsn,
    # Block size for block-wise quantization
    group_n: tl.constexpr,
    group_k: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    EVEN_K: tl.constexpr,
    NUM_SMS: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    use_fp8_w8a8: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices, followed by a fused SiLU-and-multiply.
    This is the persistent version of the fused_moe kernel: each program
    starts at tile `program_id` and advances by NUM_SMS tiles per iteration.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output tensor. Only N // 2 columns are written per row: each
        output column j is silu(acc[:, j]) * acc[:, j + N // 2], where acc is
        the scaled A @ B product for that token.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.

    NOTE(review): there is no special handling of a negative expert id in
    this kernel; the loaded id is used directly to offset `b_ptr`. Confirm
    that callers never pass -1 entries in `expert_ids` on this path.
    """
    # -----------------------------------------------------------
    # Simply compute how many iterations each persistent block needs to do
    start_pid = tl.program_id(axis=0)
    NUM_XCDS: tl.constexpr = 8
    # Load tile-invariant runtime constant
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)

    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    tile_id = start_pid

    offs_k = tl.arange(0, BLOCK_SIZE_K)

    num_tiles = num_pid_m * num_pid_n

    # Number of tiles this program handles: tile_id, tile_id + NUM_SMS, ...
    # while staying below num_tiles.
    num_valid_tiles = tl.cdiv((num_tiles - tile_id), NUM_SMS)

    for _ in range(0, num_valid_tiles):
        tile_id_remapped = remap_xcd(tile_id, num_tiles, NUM_XCDS)
        pid_m, pid_n = pid_grid(tile_id_remapped, num_pid_m, num_pid_n, GROUP_SIZE_M)

        # Compute the mask
        offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
        offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
        token_mask = offs_token < num_valid_tokens
        off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)

        # silu ptrs
        BLOCK_SIZE_HALF: tl.constexpr = BLOCK_SIZE_N // 2
        i = tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
        # [0, 0, 1, 1, ..., BLOCK_SIZE_HALF - 1, BLOCK_SIZE_HALF - 1]
        i_floor = i // 2
        offs_half = (pid_n * (BLOCK_SIZE_N // 2) + i_floor) % (N // 2)
        # (i % 2): [0, 1, 0, 1,...] (alternating)
        # (i % 2) * (N // 2) : [0, (N // 2), 0, (N // 2),...]
        # So offs_bn now takes element from the first BLOCK_SIZE_HALF half and
        # the second BLOCK_SIZE_HALF half in an alternating way (This allows us
        # to do reshape without permute)
        offs_bn = (offs_half + (i % 2) * (N // 2)) % N

        # Compute the A pointer; each sorted slot maps back to its source row
        # of A via `// top_k`.
        a_ptrs = a_ptr + (
            offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
        )
        # Compute the B pointer
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
        )

        if use_int8_w8a16:
            # One weight scale per output column, applied once after the K loop.
            b_scale_ptrs = (
                b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
            )
            b_scale = tl.load(b_scale_ptrs)

        if use_fp8_w8a8:
            if group_k > 0 and group_n > 0:
                # Block-wise scales: loaded per K step inside the loop.
                a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
                offs_bsn = offs_bn // group_n
                b_scale_ptrs = (
                    b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
                )
            else:
                # Single activation scale and one weight scale per expert.
                a_scale = tl.load(a_scale_ptr)
                b_scale = tl.load(b_scale_ptr + off_experts)

        # fp32 accumulator for the [BLOCK_SIZE_M, BLOCK_SIZE_N] tile.
        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

        for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
            # Load the next block of A and B, generate a mask by checking the
            # K dimension.
            if EVEN_K:
                a = tl.load(a_ptrs, mask=token_mask[:, None], other=0.0)
                b = tl.load(b_ptrs)
            else:
                a = tl.load(
                    a_ptrs,
                    mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                    other=0.0,
                )
                b = tl.load(
                    b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0
                )
            # We accumulate along the K dimension.
            if use_int8_w8a16:
                accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
            elif use_fp8_w8a8:
                if group_k > 0 and group_n > 0:
                    k_start = k * BLOCK_SIZE_K
                    offs_ks = k_start // group_k
                    a_scale = tl.load(
                        a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0
                    )
                    b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
                    accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
                else:
                    accumulator = tl.dot(a, b, acc=accumulator)
            else:
                accumulator += tl.dot(a, b)
            # Advance the ptrs to the next K block.
            a_ptrs += BLOCK_SIZE_K * stride_ak
            b_ptrs += BLOCK_SIZE_K * stride_bk

        if MUL_ROUTED_WEIGHT:
            moe_weight = tl.load(
                topk_weights_ptr + offs_token, mask=token_mask, other=0
            )
            accumulator = accumulator * moe_weight[:, None]

        if use_int8_w8a16:
            accumulator = (accumulator * b_scale).to(compute_type)
        elif use_fp8_w8a8:
            if group_k > 0 and group_n > 0:
                # Block-wise scales were already applied inside the K loop.
                accumulator = accumulator.to(compute_type)
            else:
                accumulator = (accumulator * a_scale * b_scale).to(compute_type)
        else:
            accumulator = accumulator.to(compute_type)

        # silu_and_mul: the interleaved column order lets reshape(..., 2) +
        # split separate the first-half and second-half columns.
        silu_acc, mul_acc = (
            accumulator.to(tl.float32).reshape(BLOCK_SIZE_M, BLOCK_SIZE_HALF, 2).split()
        )
        # x / (1 + 2^(-x * log2(e))) == x * sigmoid(x); 1.44269504089 ~= log2(e)
        silu_acc = silu_acc / (1.0 + tl.exp2(-(silu_acc * 1.44269504089)))
        accumulator = (silu_acc * mul_acc).to(compute_type)

        # -----------------------------------------------------------
        # Write back the block of the output (BLOCK_SIZE_HALF columns wide)
        offs_cn = pid_n * BLOCK_SIZE_HALF + tl.arange(0, BLOCK_SIZE_HALF)
        c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N // 2)
        tl.store(c_ptrs, accumulator, mask=c_mask)

        # advance tile_id
        tile_id += NUM_SMS
def fused_moe_silu(
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    A_scale: Optional[torch.Tensor],
    B_scale: Optional[torch.Tensor],
    B_zp: Optional[torch.Tensor],
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_tokens_post_padded: torch.Tensor,
    mul_routed_weight: bool,
    top_k: int,
    compute_type: tl.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool,
    block_shape: Optional[List[int]] = None,
    config: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Launch the fused MoE GEMM + SiLU-and-mul Triton kernel, writing into ``C``.

    One of four kernels is launched, selected by two switches:

    * grouped weight-only quantization (``use_int8_w8a16`` or
      ``use_int4_w4a16`` with ``block_shape[1] > 0``) selects the
      ``*_gptq_awq`` kernels, otherwise the plain kernels are used;
    * the module-level ``_USE_MOE_PERSISTENT_KERNEL`` flag selects the
      persistent variant of either family.

    Args:
        A: Activations. ``A.shape[1]`` is passed to the kernel as K (minus
            ``_PADDING_SIZE`` on the non-GPTQ/AWQ path).
        B: Expert weights. ``B.shape[1]`` is passed as N; strides are passed
            in the order ``stride(0), stride(2), stride(1)``.
        C: Output tensor, written in place by the kernel.
        A_scale: Activation scale. Must be ``None`` when no quantization flag
            is set; it is recomputed here for per-tensor fp8
            (``use_fp8_w8a8`` with ``block_shape is None``); it must be given
            for block-wise fp8.
        B_scale: Weight scale. Required for every quantized mode and must be
            ``None`` otherwise.
        B_zp: Optional weight zero points (GPTQ/AWQ kernels only).
        topk_weights: Routing weights; last-dim stride must be 1.
        topk_ids: Routing ids; only ``numel()`` is used here.
        sorted_token_ids: Padded, expert-sorted token ids; stride must be 1.
        expert_ids: Expert id per block, forwarded to the kernel.
        num_tokens_post_padded: Forwarded to the kernel.
        mul_routed_weight: Forwarded as ``MUL_ROUTED_WEIGHT``.
        top_k: Number of experts per token.
        compute_type: Triton dtype forwarded to the kernel.
        use_fp8_w8a8: Enable fp8 weights and activations.
        use_int8_w8a16: Enable int8 weight-only quantization.
        use_int4_w4a16: Enable int4 weight-only quantization.
        block_shape: ``[block_n, block_k]`` quantization group sizes, or
            ``None``.
        config: Kernel meta-parameters, splatted into the launch. Must
            contain ``"BLOCK_SIZE_M"``; the GPTQ/AWQ path also reads
            ``"BLOCK_SIZE_K"``.

    Raises:
        ValueError: If ``config`` is ``None``.
        AssertionError: If the scale/stride/shape preconditions above fail.
    """
    # `config` is subscripted below; fail with a clear message instead of an
    # opaque "'NoneType' object is not subscriptable" after work has started.
    if config is None:
        raise ValueError(
            "fused_moe_silu requires a kernel launch `config` (got None); "
            'it must provide at least "BLOCK_SIZE_M"'
        )

    assert topk_weights.stride(1) == 1
    assert sorted_token_ids.stride(0) == 1

    if use_fp8_w8a8:
        assert B_scale is not None
        if block_shape is None:
            # Per-tensor path: quantize A here into a fresh fp8 buffer with a
            # single fp32 scale.
            output = torch.zeros(A.shape, device=A.device, dtype=torch.float8_e4m3fnuz)
            A_scale = torch.zeros(1, device=A.device, dtype=torch.float32)
            A, A_scale = _MOE_A_QUANT_FUNC(output, A, A_scale)
        else:
            # TODO: Add support for per token group quantization
            assert len(block_shape) == 2
            block_n, block_k = block_shape[0], block_shape[1]
            # A is not quantized here on this path, so the caller has to
            # provide the matching block-wise scale.
            # A, A_scale = per_token_group_quant_fp8(A, block_k)
            assert A_scale is not None
            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
    elif use_int8_w8a16 or use_int4_w4a16:
        assert B_scale is not None
        assert block_shape is None or block_shape[0] == 0
    else:
        assert A_scale is None
        assert B_scale is None

    padded_tokens = sorted_token_ids.shape[0]
    EM = padded_tokens
    if A.shape[0] < config["BLOCK_SIZE_M"]:
        # Optimize for small batch_size.
        # We assume that the top-k ids of each token are unique, so
        # num_valid_experts <= batch_size <= BLOCK_SIZE_M,
        # and we can skip some invalid blocks.
        EM = min(padded_tokens, A.shape[0] * top_k * config["BLOCK_SIZE_M"])

    grouped_weight_only = (
        (use_int8_w8a16 or use_int4_w4a16)
        and block_shape is not None
        and block_shape[1] > 0
    )
    if grouped_weight_only:
        assert B_scale is not None and B_scale.ndim == 3
        assert B_zp is None or B_zp.ndim == 3

    N = B.shape[1]
    persistent = _USE_MOE_PERSISTENT_KERNEL

    if persistent:
        NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count * 2

        # The persistent grid is sized from the full padded token count (not
        # the reduced EM) and capped at NUM_SMS programs.
        def grid(META):
            return (
                min(
                    NUM_SMS,
                    triton.cdiv(padded_tokens, META["BLOCK_SIZE_M"])
                    * triton.cdiv(N, META["BLOCK_SIZE_N"]),
                ),
            )

        persistent_meta = {"NUM_SMS": NUM_SMS}
    else:

        def grid(META):
            return (
                triton.cdiv(EM, META["BLOCK_SIZE_M"])
                * triton.cdiv(N, META["BLOCK_SIZE_N"]),
            )

        persistent_meta = {}

    # Strides shared by every kernel variant. B's last two strides are passed
    # swapped: stride(2) first, then stride(1).
    abc_strides = (
        A.stride(0),
        A.stride(1),
        B.stride(0),
        B.stride(2),
        B.stride(1),
        C.stride(0),
        C.stride(1),
    )

    if grouped_weight_only:
        kernel = (
            _fused_moe_persistent_silu_kernel_gptq_awq
            if persistent
            else _fused_moe_silu_kernel_gptq_awq
        )
        has_zp = B_zp is not None
        zp_strides = (
            (B_zp.stride(0), B_zp.stride(2), B_zp.stride(1)) if has_zp else (0, 0, 0)
        )
        kernel[grid](
            A,
            B,
            C,
            B_scale,
            B_zp,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            N,
            A.shape[1],
            EM,
            topk_ids.numel(),
            *abc_strides,
            B_scale.stride(0),
            B_scale.stride(2),
            B_scale.stride(1),
            *zp_strides,
            block_k_diviable=A.shape[1] % config["BLOCK_SIZE_K"] == 0,
            group_size=block_shape[1],
            MUL_ROUTED_WEIGHT=mul_routed_weight,
            top_k=top_k,
            compute_type=compute_type,
            has_zp=has_zp,
            use_int4_w4a16=use_int4_w4a16,
            use_int8_w8a16=use_int8_w8a16,
            **persistent_meta,
            **config,
        )
        return

    kernel = _fused_moe_persistent_silu_kernel if persistent else _fused_moe_silu_kernel

    # Scale strides collapse to 0 when the scale is absent or has too few
    # dimensions for that stride to exist.
    a_scale_is_2d = A_scale is not None and A_scale.ndim == 2
    b_scale_min_2d = B_scale is not None and B_scale.ndim >= 2
    b_scale_is_3d = B_scale is not None and B_scale.ndim == 3

    # NOTE(review): the persistent plain kernel is given the full padded
    # token count as EM, while the non-persistent one gets the reduced EM.
    # This is kept exactly as the original code had it.
    em_arg = padded_tokens if persistent else EM

    kernel[grid](
        A,
        B,
        C,
        A_scale,
        B_scale,
        topk_weights,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        N,
        A.shape[1] - _PADDING_SIZE,
        em_arg,
        topk_ids.numel(),
        *abc_strides,
        A_scale.stride(0) if a_scale_is_2d else 0,
        A_scale.stride(1) if a_scale_is_2d else 0,
        B_scale.stride(0) if b_scale_min_2d else 0,
        B_scale.stride(2) if b_scale_is_3d else 0,
        B_scale.stride(1) if b_scale_min_2d else 0,
        0 if block_shape is None else block_shape[0],
        0 if block_shape is None else block_shape[1],
        MUL_ROUTED_WEIGHT=mul_routed_weight,
        top_k=top_k,
        compute_type=compute_type,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        **persistent_meta,
        **config,
    )
FUSED_SHARED_EXPERTS else TOPK + + offs_topk = tl.arange(0, _TOPK) + + # Masks for bounds checking + mask_m = offs_m < M + mask_n = offs_n < N + + # Initialize accumulator for matmul (will be in float32 due to default acc_type) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + # Loop over K dimension in chunks of BLOCK_K + for k in range(0, K, BLOCK_K): + # Compute pointers for A and B + offs_k_iter = k + offs_k + mask_k = offs_k_iter < K + + X_ptrs = X_ptr + ( + # pyre-ignore + offs_m[:, None] * stride_xm + + offs_k_iter[None, :] * stride_xk + ) + W_ptrs = W_ptr + ( + offs_k_iter[:, None] * stride_wk + offs_n[None, :] * stride_wn + ) + + # Load A and B tiles + # pyre-ignore + x = tl.load(X_ptrs, mask=(mask_m[:, None] & mask_k[None, :]), other=0.0) + w = tl.load(W_ptrs, mask=(mask_k[:, None] & mask_n[None, :]), other=0.0) + + # Compute partial matmul for the current block using FP16 inputs and FP32 accumulation + acc = tl.dot(x, w, acc=acc) + + acc = tl.sigmoid(acc) + # Get topk results + topk_ids = tl.argmax(acc, axis=1, tie_break_left=True) # Shape: (BLOCK_M,) + topk_weights = tl.max(acc, axis=1) # Shape: (BLOCK_M,) + + # Create buffers for results + topk_ids_buffer = tl.zeros((BLOCK_M, _TOPK), dtype=tl.int32) + topk_weights_buffer = tl.zeros((BLOCK_M, _TOPK), dtype=tl.float32) + + if FUSED_SHARED_EXPERTS: + # Set the first column with broadcasting + topk_ids_buffer = tl.where( + (offs_topk[None, :] < _TOPK - 1), topk_ids[:, None], N + ) + topk_weights_buffer = tl.where( + (offs_topk[None, :] < _TOPK - 1), topk_weights[:, None], 1.0 + ) + else: + topk_ids_buffer = topk_ids[:, None] + topk_weights_buffer = topk_weights[:, None] + + topk_ids_ptrs = ( + topk_ids_ptr + + offs_m[:, None] * stride_topk_ids_m + + offs_topk[None, :] * stride_topk_ids_n + ) + + topk_weights_ptrs = ( + topk_weights_ptr + + offs_m[:, None] * stride_topk_weights_m + + offs_topk[None, :] * stride_topk_weights_n + ) + + tl.store(topk_ids_ptrs, topk_ids_buffer) + 
@functools.lru_cache(maxsize=1024)
def _get_config(M, N, K):
    """Return the tuned launch config for a routing problem of size (M, N, K).

    The per-device JSON table is read on first use and memoised on the
    function object. Only ``M`` and ``N`` select an entry; ``K`` takes part
    only in the ``lru_cache`` key.
    """
    if not hasattr(_get_config, "_config_dict"):
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/moe/{dev}-MOE_ROUTING_SIGMOID_TOPK1.json"
        with open(fpath, "r") as file:
            # Publish the table only after a successful load. Setting an
            # empty dict first would make a failed read stick and turn every
            # later call into a KeyError.
            _get_config._config_dict = json.load(file)

    n_key = "N16" if N <= 16 else "N128"
    m_key = (
        "xlarge"
        if M >= 8192
        else "large" if M >= 4096 else "medium" if M >= 2048 else "small"
    )
    return _get_config._config_dict[n_key][m_key]


def routing_sigmoid_top1(
    x, w, topk, fused_shared_experts=False, config: Optional[dict] = None
):
    """Compute top-1 sigmoid routing: ``sigmoid(x @ w)`` followed by arg-max.

    Args:
        x: Input activations; flattened to 2-D ``(M, K)`` via
            ``x.view(-1, x.shape[-1])``.
        w: Router weights of shape ``(K, N)``.
        topk: Must be 1.
        fused_shared_experts: When true the outputs get one extra column
            (``topk + 1`` columns in total).
        config: Optional kernel meta-parameters, splatted into the launch.
            When ``None`` the tuned table entry from ``_get_config`` is used.

    Returns:
        ``(topk_ids, topk_weights)``: an int32 and a float32 tensor, both of
        shape ``(M, topk)`` or ``(M, topk + 1)`` when ``fused_shared_experts``
        is set, allocated on ``x.device``.
    """
    x = x.view(-1, x.shape[-1])

    assert topk == 1

    # M: batch_size x seq_len, K: hidden_dim, N: num_experts
    M, K = x.shape
    Kb, N = w.shape
    assert K == Kb

    _topk = topk
    if fused_shared_experts:
        _topk += 1

    # Output tensors
    topk_ids = torch.empty((M, _topk), device=x.device, dtype=torch.int32)
    topk_weights = torch.empty((M, _topk), device=x.device, dtype=torch.float32)

    # Honour a caller-supplied config; previously it was always overwritten
    # by the tuned table, which made the parameter a no-op.
    if config is None:
        config = _get_config(M, N, K)

    # One program per BLOCK_M rows.
    def grid(META):
        return (triton.cdiv(M, META["BLOCK_M"]),)

    _routing_sigmoid_top1_kernel[grid](
        x,
        w,
        topk_ids,
        topk_weights,
        M,
        N,
        K,
        x.stride(0),
        x.stride(1),
        w.stride(0),
        w.stride(1),
        topk_ids.stride(0),
        topk_ids.stride(1),
        topk_weights.stride(0),
        topk_weights.stride(1),
        # NOTE(review): BLOCK_N is set to N, so N presumably has to satisfy
        # whatever extent constraints the kernel's tl.arange imposes - confirm.
        BLOCK_N=N,
        TOPK=topk,
        FUSED_SHARED_EXPERTS=fused_shared_experts,
        **config,
    )

    return topk_ids, topk_weights
@triton.jit
def _per_token_quant(
    x,
    row_max,
    DTYPE_MAX: tl.constexpr,
):
    """
    Scale ``x`` for quantization using a scale derived from ``row_max``.

    The scale is ``row_max / DTYPE_MAX``; a scale of exactly zero is replaced
    by 1.0 so that its reciprocal is finite. Returns ``(x * (1 / scale), scale)``.
    No dtype cast is performed here.
    """
    scale = row_max / DTYPE_MAX
    # Guard the reciprocal below against a zero scale.
    scale = tl.where(scale == 0, 1.0, scale)

    inv_scale = 1 / scale

    return x * inv_scale, scale
+ row = tl.program_id(0) + x_ptr_start = x_ptr + (row * x_row_stride) + y_ptr_start = y_ptr + (row * y_row_stride) + + loop_num = tl.cdiv(n_cols, BLOCK_SIZE) - 1 + + # Calculate mean + mean = 0 + _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load(x_ptr_start + col_offsets).to(tl.float32) # Unmasked loads + _mean += x_block + + # For last iteration, do masked load + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load( + x_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0 + ).to(tl.float32) + _mean += x_block + mean = tl.sum(_mean, axis=0) / n_cols + + # Calculate variance + _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load(x_ptr_start + col_offsets).to(tl.float32) # Unmasked loads + x_block = x_block - mean + _var += x_block * x_block + + # For last iteration, do masked load + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load( + x_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0 + ).to(tl.float32) + x_block = tl.where(col_offsets < n_cols, x_block - mean, 0.0) + _var += x_block * x_block + + var = tl.sum(_var, axis=0) / n_cols + rstd = tl.rsqrt(var + eps) + + # Write mean / rstd + tl.store(mean_ptr + row, mean) + tl.store(rstd_ptr + row, rstd) + + # Normalize and store + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + w_block = tl.load(w_ptr + col_offsets) + b_block = tl.load(b_ptr + col_offsets) + x_block = tl.load(x_ptr_start + col_offsets).to(tl.float32) + y_block = (x_block - mean) * rstd + y_block = y_block * w_block + b_block + tl.store(y_ptr_start + col_offsets, y_block) + + # For last iteration, do masked load and store + col_offsets = 
loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + w_block = tl.load(w_ptr + col_offsets, mask=mask, other=0.0) + b_block = tl.load(b_ptr + col_offsets, mask=mask, other=0.0) + x_block = tl.load(x_ptr_start + col_offsets, mask=mask, other=0.0).to(tl.float32) + y_block = (x_block - mean) * rstd + y_block = y_block * w_block + b_block + tl.store(y_ptr_start + col_offsets, y_block, mask=mask) + + +@triton.jit +def _fused_add_layernorm_kernel( + # Pointers to matrices + x_ptr, + y_ptr, + res_in_ptr, + res_out_ptr, + w_ptr, + b_ptr, + mean_ptr, + rstd_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `x_row_stride` is + # how much to increase `x_ptr` by to get the element one row down. + x_row_stride, + y_row_stride, + # Matrix dimensions + n_rows, + n_cols, + # Epsilon to avoid division by zero + eps, + # Meta-parameters + BLOCK_SIZE: tl.constexpr, +): + """ + Note: this is Triton jited function and not meant to be called directly. Call layernorm2d_fwd_with_add function + below + + Performs an addition between two inputs and then applies Layer Normalization over + the addition result. + + Key parameters: + - X: The input tensor to be normalized with shape (M, N). + - Y: The output tensor with the same shape as the input one. + - Res_in: The tensor to be added to the X tensor with shape (M, N). + - Res_out: The tensor in which the addition result will be stored with shape (M, N). + - W: The learnable weights tensor with shape (N, ). + - B: The learnable bias tensor with shape (N, ). + """ + # Map the program id to the row of X and Y it should compute. 
+ row = tl.program_id(0) + x_ptr_start = x_ptr + (row * x_row_stride) + y_ptr_start = y_ptr + (row * y_row_stride) + res_in_ptr_start = res_in_ptr + (row * x_row_stride) + res_out_ptr_start = res_out_ptr + (row * x_row_stride) + + loop_num = tl.cdiv(n_cols, BLOCK_SIZE) - 1 + + # Calculate mean + mean = 0 + _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + _x_block = tl.load(x_ptr_start + col_offsets) # Unmasked loads + res_in_block = tl.load(res_in_ptr_start + col_offsets) + _x_block += res_in_block + tl.store(res_out_ptr_start + col_offsets, _x_block) # Stores residual_out + _mean += _x_block.to(tl.float32) + + # For last iteration, do masked load + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + _x_block = tl.load(x_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0) + res_in_block = tl.load( + res_in_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0 + ) + _x_block += res_in_block + tl.store( + res_out_ptr_start + col_offsets, _x_block, mask=col_offsets < n_cols + ) # Stores residual_out + _mean += _x_block.to(tl.float32) + mean = tl.sum(_mean, axis=0) / n_cols + + # Calculate variance + _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load(res_out_ptr_start + col_offsets).to( + tl.float32 + ) # Unmasked loads + x_block = x_block - mean + _var += x_block * x_block + + # For last iteration, do masked load + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load( + res_out_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0 + ).to(tl.float32) + x_block = tl.where(col_offsets < n_cols, x_block - mean, 0.0) + _var += x_block * x_block + + var = tl.sum(_var, axis=0) / n_cols + rstd = tl.rsqrt(var + eps) + + # Write mean / rstd + tl.store(mean_ptr 
@triton.jit
def _quant_layernorm_kernel(
    # Pointers to matrices
    x_ptr,
    y_ptr,
    w_ptr,
    b_ptr,
    x_scale_ptr,
    y_scale_ptr,
    # Auxiliary tensor to store intermediate data
    aux_ptr,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `x_row_stride` is
    # how much to increase `x_ptr` by to get the element one row down.
    x_row_stride,
    y_row_stride,
    aux_row_stride,
    # Matrix dimensions
    n_rows,
    n_cols,
    # Epsilon to avoid division by zero
    eps,
    # Dtype max for quantization
    DTYPE_MAX: tl.constexpr,
    # Meta-parameters
    IS_SMOOTH: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Note: this is a Triton jitted function and not meant to be called directly. Call layer_norm function
    below

    Applies Layer Normalization over a mini-batch of inputs and quantizes the result.

    Each program handles one row and walks its columns in BLOCK_SIZE tiles in four passes:
    mean, variance, normalize (result parked in the aux buffer while the row-wide absolute
    maximum is tracked), and finally quantize using that maximum. In every pass all tiles
    except the last are accessed unmasked; the last tile is masked against n_cols.

    Key parameters:
    - X: The input tensor to be normalized with shape (M, N).
    - Y: The output tensor with the same shape as the input one.
    - W: The learnable weights tensor with shape (N, ).
    - B: The learnable bias tensor with shape (N, ).
    - X_scale: The tensor to be multiplied by the LayerNorm output if IS_SMOOTH is true, with shape (n_cols, ).
    - Y_scale: The tensor where the scale for each row will be stored with shape (n_rows, ).
    - Aux: Scratch tensor, one row per input row, holding the normalized values between passes.
    """
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    x_ptr_start = x_ptr + (row * x_row_stride)
    y_ptr_start = y_ptr + (row * y_row_stride)
    aux_ptr_start = aux_ptr + (row * aux_row_stride)

    # Number of full (unmasked) tiles; the remaining tile is handled separately with a mask.
    loop_num = tl.cdiv(n_cols, BLOCK_SIZE) - 1

    # Calculate mean
    mean = 0
    _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    loop_num_l = loop_num
    for b in range(0, loop_num_l):
        col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        x_block = tl.load(x_ptr_start + col_offsets).to(tl.float32)  # Unmasked loads
        _mean += x_block

    # For last iteration, do masked load
    col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x_block = tl.load(
        x_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0
    ).to(tl.float32)
    _mean += x_block
    mean = tl.sum(_mean, axis=0) / n_cols

    # Calculate variance
    _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    loop_num_l = loop_num
    for b in range(0, loop_num_l):
        col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        x_block = tl.load(x_ptr_start + col_offsets).to(tl.float32)  # Unmasked loads
        x_block = x_block - mean
        _var += x_block * x_block

    # For last iteration, do masked load
    col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x_block = tl.load(
        x_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0
    ).to(tl.float32)
    # Out-of-range lanes were loaded as 0; zero them again after centering so
    # they do not contribute (0 - mean)^2 to the variance.
    x_block = tl.where(col_offsets < n_cols, x_block - mean, 0.0)
    _var += x_block * x_block

    var = tl.sum(_var, axis=0) / n_cols
    rstd = tl.rsqrt(var + eps)

    # Running maximum of |y| across the whole row, used as the quantization range.
    row_max: tl.float32 = 0.0

    # Normalize and write the result to the aux buffer
    # (NOTE(review): described as fp32 originally; the stored dtype follows the aux tensor - confirm)
    loop_num_l = loop_num
    for b in range(0, loop_num_l):
        col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        w_block = tl.load(w_ptr + col_offsets)
        b_block = tl.load(b_ptr + col_offsets)
        x_block = tl.load(x_ptr_start + col_offsets).to(tl.float32)
        y_block = (x_block - mean) * rstd
        y_block = y_block * w_block + b_block

        if IS_SMOOTH:
            # Apply the per-column smoothing scale before measuring the range.
            x_scale_ptrs = x_scale_ptr + col_offsets
            x_scale = tl.load(x_scale_ptrs)
            y_block *= x_scale

        # Computes the max value for each row
        blk_max = tl.max(tl.abs(y_block), axis=-1)
        row_max = max(row_max, blk_max)

        aux_ptrs = aux_ptr_start + col_offsets
        tl.store(aux_ptrs, y_block)

    # For last iteration, do masked load
    col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols
    w_block = tl.load(w_ptr + col_offsets, mask=mask, other=0.0)
    b_block = tl.load(b_ptr + col_offsets, mask=mask, other=0.0)
    x_block = tl.load(x_ptr_start + col_offsets, mask=mask, other=0.0).to(tl.float32)
    # Masked lanes are forced to 0 (and w/b were loaded as 0 there), so they
    # cannot raise the row maximum computed below.
    y_block = tl.where(mask, (x_block - mean) * rstd, 0.0)
    y_block = y_block * w_block + b_block

    if IS_SMOOTH:
        x_scale_ptrs = x_scale_ptr + col_offsets
        x_scale = tl.load(x_scale_ptrs, mask=mask, other=0.0)
        y_block *= x_scale

    # Computes the max value for each row
    blk_max = tl.max(tl.abs(y_block), axis=-1)
    row_max = max(row_max, blk_max)

    tl.store(aux_ptr_start + col_offsets, y_block, mask=mask)

    # Apply quantization and write output
    loop_num_l = loop_num
    for b in range(0, loop_num_l):
        col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        aux_block = tl.load(aux_ptr_start + col_offsets)  # Unmasked loads

        # The returned scale is identical for every tile of the row, so it is
        # ignored here and stored once after the final tile.
        y_block, _ = _per_token_quant(aux_block, row_max, DTYPE_MAX)

        tl.store(y_ptr_start + col_offsets, y_block.to(y_ptr.type.element_ty))

    # For last iteration, do masked load and store
    col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols
    aux_block = tl.load(aux_ptr_start + col_offsets, mask=mask, other=0.0)

    y_block, y_scale = _per_token_quant(aux_block, row_max, DTYPE_MAX)

    # Store scale (one value per row)
    tl.store(y_scale_ptr + row, y_scale.to(y_scale_ptr.type.element_ty))

    tl.store(y_ptr_start + col_offsets, y_block.to(y_ptr.type.element_ty), mask=mask)
+ row = tl.program_id(0) + x_ptr_start = x_ptr + (row * x_row_stride) + y_ptr_start = y_ptr + (row * y_row_stride) + res_in_ptr_start = res_in_ptr + (row * x_row_stride) + res_out_ptr_start = res_out_ptr + (row * x_row_stride) + aux_ptr_start = aux_ptr + (row * aux_row_stride) + + loop_num = tl.cdiv(n_cols, BLOCK_SIZE) - 1 + + # Calculate mean + mean = 0 + _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + _x_block = tl.load(x_ptr_start + col_offsets) # Unmasked loads + res_in_block = tl.load(res_in_ptr_start + col_offsets) + _x_block += res_in_block + tl.store(res_out_ptr_start + col_offsets, _x_block) # Stores residual_out + _mean += _x_block.to(tl.float32) + + # For last iteration, do masked load + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + _x_block = tl.load(x_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0) + res_in_block = tl.load( + res_in_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0 + ) + _x_block += res_in_block + tl.store( + res_out_ptr_start + col_offsets, _x_block, mask=col_offsets < n_cols + ) # Stores residual_out + _mean += _x_block.to(tl.float32) + mean = tl.sum(_mean, axis=0) / n_cols + + # Calculate variance + _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load(res_out_ptr_start + col_offsets).to( + tl.float32 + ) # Unmasked loads + x_block = x_block - mean + _var += x_block * x_block + + # For last iteration, do masked load + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_block = tl.load( + res_out_ptr_start + col_offsets, mask=col_offsets < n_cols, other=0.0 + ).to(tl.float32) + x_block = tl.where(col_offsets < n_cols, x_block - mean, 0.0) + _var += x_block * x_block + + var = tl.sum(_var, axis=0) / n_cols + rstd = tl.rsqrt(var 
+ eps) + + row_max: tl.float32 = 0.0 + + # Normalize and write output temporarily as fp32 + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + w_block = tl.load(w_ptr + col_offsets) + b_block = tl.load(b_ptr + col_offsets) + x_block = tl.load(res_out_ptr_start + col_offsets).to(tl.float32) + y_block = (x_block - mean) * rstd + y_block = y_block * w_block + b_block + + if IS_SMOOTH: + x_scale_ptrs = x_scale_ptr + col_offsets + x_scale = tl.load(x_scale_ptrs) + y_block *= x_scale + + # Computes the max value for each row + blk_max = tl.max(tl.abs(y_block), axis=-1) + row_max = max(row_max, blk_max) + + aux_ptrs = aux_ptr_start + col_offsets + tl.store(aux_ptrs, y_block) + + # For last iteration, do masked load and store + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + w_block = tl.load(w_ptr + col_offsets, mask=mask, other=0.0) + b_block = tl.load(b_ptr + col_offsets, mask=mask, other=0.0) + x_block = tl.load(res_out_ptr_start + col_offsets, mask=mask, other=0.0).to( + tl.float32 + ) + y_block = tl.where(mask, (x_block - mean) * rstd, 0.0) + y_block = y_block * w_block + b_block + + if IS_SMOOTH: + x_scale_ptrs = x_scale_ptr + col_offsets + x_scale = tl.load(x_scale_ptrs, mask=mask, other=0.0) + y_block *= x_scale + + # Computes the max value for each row + blk_max = tl.max(tl.abs(y_block), axis=-1) + row_max = max(row_max, blk_max) + + tl.store(aux_ptr_start + col_offsets, y_block, mask=mask) + + # Apply quantization and write output + loop_num_l = loop_num + for b in range(0, loop_num_l): + col_offsets = b * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + aux_block = tl.load(aux_ptr_start + col_offsets) # Unmasked loads + + y_block, _ = _per_token_quant(aux_block, row_max, DTYPE_MAX) + + tl.store(y_ptr_start + col_offsets, y_block.to(y_ptr.type.element_ty)) + + # For last iteration, do masked load and store + col_offsets = loop_num_l * BLOCK_SIZE + tl.arange(0, 
def _layernorm_forward(
    y: torch.Tensor,
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    mean: torch.Tensor,
    rstd: torch.Tensor,
    eps: float = 1e-5,
):
    """Launch ``_layernorm_kernel`` with one program per row of the 2-D ``x``.

    ``y``, ``mean`` and ``rstd`` are output buffers filled by the kernel;
    nothing is returned.
    """
    n_rows, n_cols = x.shape

    # The column tile is the next power of two >= n_cols, capped at 64KB
    # worth of x's element type.
    tile_cap = 65536 // x.element_size()
    tile = min(tile_cap, triton.next_power_of_2(n_cols))

    launch_grid = (n_rows,)
    _layernorm_kernel[launch_grid](
        x,
        y,
        weight,
        bias,
        mean,
        rstd,
        x.stride(0),
        y.stride(0),
        n_rows,
        n_cols,
        eps,
        tile,
    )


def _layernorm_forward_with_add(
    y: torch.Tensor,
    x: torch.Tensor,
    res_in: torch.Tensor,
    res_out: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    mean: torch.Tensor,
    rstd: torch.Tensor,
    epsilon: float,
):
    """Launch ``_fused_add_layernorm_kernel`` with one program per row of ``x``.

    ``y``, ``res_out``, ``mean`` and ``rstd`` are output buffers filled by the
    kernel; nothing is returned. Only the row strides of ``x`` and ``y`` are
    passed to the kernel.
    """
    n_rows, n_cols = x.shape

    # The column tile is the next power of two >= n_cols, capped at 64KB
    # worth of x's element type.
    tile_cap = 65536 // x.element_size()
    tile = min(tile_cap, triton.next_power_of_2(n_cols))

    launch_grid = (n_rows,)
    _fused_add_layernorm_kernel[launch_grid](
        x,
        y,
        res_in,
        res_out,
        weight,
        bias,
        mean,
        rstd,
        x.stride(0),
        y.stride(0),
        n_rows,
        n_cols,
        epsilon,
        tile,
    )
IGNORE_DW_DB: tl.constexpr = False, +): + # Map the program id to the elements of X, DX, and DY it should compute. + pid = tl.program_id(0) + tile_num = tl.num_programs(0) + rows_per_tile = NUM_ROWS // tile_num + if pid < NUM_ROWS % tile_num: + rows_per_tile += 1 + + if USE_BLOCKED: + # Blocked approach: + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + num_col_blocks = tl.cdiv(N, BLOCK_SIZE_N) - 1 + row = pid + + for _ in range(0, rows_per_tile): + # Load row statistics: + mean = tl.load(Mean + row) + rstd = tl.load(Rstd + row) + + # Accumulate c1 and c2 sums: + + x_row_ptr = X + row * stride + dy_row_ptr = DY + row * stride + + c1 = 0.0 + c2 = 0.0 + + for block_idx in tl.range(0, num_col_blocks): + cols = block_idx * BLOCK_SIZE_N + col_offsets + + x = tl.load(x_row_ptr + cols).to(tl.float32) + dy = tl.load(dy_row_ptr + cols).to(tl.float32) + w = tl.load(W + cols).to(tl.float32) + + xhat = (x - mean) * rstd + wdy = w * dy + c1 += tl.sum(xhat * wdy, axis=0) + c2 += tl.sum(wdy, axis=0) + + cols = num_col_blocks * BLOCK_SIZE_N + col_offsets + mask = cols < N + + x = tl.load(x_row_ptr + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(dy_row_ptr + cols, mask=mask, other=0).to(tl.float32) + w = tl.load(W + cols, mask=mask, other=0).to(tl.float32) + + xhat = (x - mean) * rstd + wdy = w * dy + wdy = tl.where(mask, wdy, 0) + c1 += tl.sum(xhat * wdy, axis=0) + c2 += tl.sum(wdy, axis=0) + + c1 /= N + c2 /= N + + # Compute dx and partial sums for dw and db: + + dx_row_ptr = DX + row * stride + if not IGNORE_DW_DB: + dw_row_ptr = DW + pid * N + db_row_ptr = DB + pid * N + + for block_idx in tl.range(0, num_col_blocks): + cols = block_idx * BLOCK_SIZE_N + col_offsets + + x = tl.load(x_row_ptr + cols).to(tl.float32) + dy = tl.load(dy_row_ptr + cols).to(tl.float32) + w = tl.load(W + cols).to(tl.float32) + + xhat = (x - mean) * rstd + wdy = w * dy + + dx = (wdy - (xhat * c1 + c2)) * rstd + tl.store(dx_row_ptr + cols, dx.to(DX.type.element_ty)) + if not IGNORE_DW_DB: + 
partial_dw = dy * xhat + dw_ptrs = dw_row_ptr + cols + partial_dw += tl.load(dw_ptrs).to(tl.float32) + tl.store(dw_ptrs, partial_dw.to(DW.type.element_ty)) + + partial_db = dy + db_ptrs = db_row_ptr + cols + partial_db += tl.load(db_ptrs).to(tl.float32) + tl.store(db_ptrs, partial_db.to(DB.type.element_ty)) + + cols = num_col_blocks * BLOCK_SIZE_N + col_offsets + mask = cols < N + + x = tl.load(x_row_ptr + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(dy_row_ptr + cols, mask=mask, other=0).to(tl.float32) + w = tl.load(W + cols, mask=mask, other=0).to(tl.float32) + + xhat = (x - mean) * rstd + wdy = w * dy + + dx = (wdy - (xhat * c1 + c2)) * rstd + tl.store(dx_row_ptr + cols, dx.to(DX.type.element_ty), mask=mask) + if not IGNORE_DW_DB: + partial_dw = dy * xhat + dw_ptrs = dw_row_ptr + cols + partial_dw += tl.load(dw_ptrs, mask=mask).to(tl.float32) + tl.store(dw_ptrs, partial_dw.to(DW.type.element_ty), mask=mask) + + partial_db = dy + db_ptrs = db_row_ptr + cols + partial_db += tl.load(db_ptrs, mask=mask).to(tl.float32) + tl.store(db_ptrs, partial_db.to(DB.type.element_ty), mask=mask) + + # Advance to next row. 
+ row += tile_num + + else: + # Unblocked approach: + + cols = tl.arange(0, BLOCK_SIZE_N) + mask = cols < N + row = pid + if not IGNORE_DW_DB: + dw_row = tl.zeros((BLOCK_SIZE_N,), dtype=tl.float32) + db_row = tl.zeros((BLOCK_SIZE_N,), dtype=tl.float32) + + for _ in range(0, rows_per_tile): + # Compute pointers: + x_ptrs = X + row * stride + dy_ptrs = DY + row * stride + dx_ptrs = DX + row * stride + + # Load data to SRAM: + x = tl.load(x_ptrs + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(dy_ptrs + cols, mask=mask, other=0).to(tl.float32) + w = tl.load(W + cols, mask=mask, other=0).to(tl.float32) + mean = tl.load(Mean + row) + rstd = tl.load(Rstd + row) + + # Compute dx: + xhat = (x - mean) * rstd + wdy = w * dy + wdy = tl.where(mask, wdy, 0) + c1 = tl.sum(xhat * wdy, axis=0) / N + c2 = tl.sum(wdy, axis=0) / N + dx = (wdy - (xhat * c1 + c2)) * rstd + + # Write dx: + tl.store(dx_ptrs + cols, dx.to(DX.type.element_ty), mask=mask) + if not IGNORE_DW_DB: + # Accumulate partial sums for dw and db: + dw_row += dy * xhat + db_row += dy + + # Advance to next row: + row += tile_num + if not IGNORE_DW_DB: + tl.store(DW + pid * N + cols, dw_row.to(DW.type.element_ty), mask=mask) + tl.store(DB + pid * N + cols, db_row.to(DB.type.element_ty), mask=mask) + + +@triton.jit +def _layernorm_bwd_dwdb_triton( + DW, # pointer to the partial sum of weights gradient + DB, # pointer to the partial sum of biases gradient + FINAL_DW, # pointer to the weights gradient + FINAL_DB, # pointer to the biases gradient + M, # GROUP_SIZE_M + N, # number of columns + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + # Map the program id to the elements of DW and DB it should compute. + pid = tl.program_id(0) + cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + # Iterate through the rows of DW and DB to sum the partial sums. 
# Computes the weight and bias gradients directly from X and DY, without going
# through per-tile partial-sum buffers.  Each program owns one block of
# BLOCK_SIZE_N columns and walks over all M rows in steps of BLOCK_SIZE_M.
@triton.jit
def _layernorm_bwd_dwdb_triton_v2(
    X,  # pointer to the input
    DY,  # pointer to the output gradient
    Mean,  # pointer to the mean (indexed by row)
    Rstd,  # pointer to the 1/std (indexed by row)
    stride,  # row stride used for both X and DY
    FINAL_DW,  # pointer to the weights gradient
    FINAL_DB,  # pointer to the biases gradient
    M,  # number of rows in X / DY (upper bound for the row loop and masks)
    N,  # number of columns
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
):
    # Columns handled by this program.
    pid = tl.program_id(0)
    cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    # 2D fp32 accumulators; they are reduced over the row axis only once,
    # after the loop.
    dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    # Iterate through the rows of x and dy to compute dw and db
    for i in range(0, M, BLOCK_SIZE_M):
        rows = i + tl.arange(0, BLOCK_SIZE_M)
        # Out-of-range rows load 0.0 for mean/rstd, and 0.0 for x/dy below,
        # so they contribute nothing to either accumulator.
        means = tl.load(Mean + rows, mask=rows < M, other=0.0).to(tl.float32)
        rstds = tl.load(Rstd + rows, mask=rows < M, other=0.0).to(tl.float32)
        mask = (rows[:, None] < M) & (cols[None, :] < N)
        offs = rows[:, None] * stride + cols[None, :]
        x = tl.load(X + offs, mask=mask, other=0.0).to(tl.float32)
        dy = tl.load(DY + offs, mask=mask, other=0.0).to(tl.float32)
        # Normalized input, recomputed from the saved row statistics.
        xhat = (x - means[:, None]) * rstds[:, None]
        dw += dy * xhat
        db += dy
    # Write the final sum to the output.
    sum_dw = tl.sum(dw, axis=0)
    sum_db = tl.sum(db, axis=0)
    # Cast to the destination element type; columns past N are not stored.
    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.type.element_ty), mask=cols < N)
    tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.type.element_ty), mask=cols < N)
class _Layernorm2dFwdWithAdd(torch.autograd.Function):
    """Autograd wrapper around the fused residual-add + LayerNorm launch.

    `forward` hands caller-allocated `y` and `res_out` to
    `_layernorm_forward_with_add` and returns `y`.  `backward` differentiates
    the normalization with respect to its input, which is the saved `res_out`.
    """

    @staticmethod
    def forward(ctx, y, x, res_in, res_out, weight, bias, eps, is_grad_enabled):
        """Run the fused kernel and save what `backward` needs.

        Args:
            ctx: Autograd context.
            y: Caller-allocated tensor passed to the kernel launch; returned.
            x: 2D input; `x.shape[0]` sizes the per-row statistics buffers.
            res_in: Residual tensor passed to the kernel launch.
            res_out: Caller-allocated tensor passed to the kernel launch and
                saved as the normalization input for `backward`.
            weight, bias: LayerNorm parameters.
            eps: Forwarded to the launch as `epsilon`.
            is_grad_enabled: Whether the caller had gradient mode enabled.

        Returns:
            `y`.
        """
        # res_in takes part in the normalized sum, so it must also trigger
        # saving the tensors that backward unpacks.
        is_grad = is_grad_enabled and any(
            tensor.requires_grad for tensor in [x, res_in, weight, bias]
        )

        M = x.shape[0]
        mean = torch.empty((M,), dtype=torch.float32, device=x.device)
        rstd = torch.empty((M,), dtype=torch.float32, device=x.device)

        _layernorm_forward_with_add(
            y, x, res_in, res_out, weight, bias, mean, rstd, eps
        )

        if is_grad:
            ctx.save_for_backward(res_out, weight, mean, rstd)

        return y

    @staticmethod
    def backward(ctx, dy):
        """Return one gradient per `forward` input.

        The order is (y, x, res_in, res_out, weight, bias, eps,
        is_grad_enabled).  `x` and `res_in` receive the same gradient because
        the normalization input is their sum.
        """
        x, w, m, v = ctx.saved_tensors
        N = w.shape[0]

        dw = torch.empty((N,), dtype=w.dtype, device=w.device)
        db = torch.empty((N,), dtype=w.dtype, device=w.device)
        dx = torch.empty_like(dy)

        _layernorm_backward(dy, dx, dw, db, x, w, m, v)

        # NOTE(review): gradients flowing through `res_out` itself are not
        # tracked here; only the path through `y` is differentiated.
        return None, dx, dx, None, dw, db, None, None
+ """ + return _Layernorm2dFwdWithAdd.apply( + out, + input, + residual_in, + residual_out, + weight, + bias, + epsilon, + torch.is_grad_enabled(), + ) + + +def layernorm2d_fwd_with_dynamicquant( + out: torch.Tensor, + input: torch.Tensor, + yscale: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + epsilon: float = 1e-5, + x_bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + Applies Layer Normalization and then quantizes the output + + Key parameters: + - out: The output of layer normalization with shape (M, N). Allocated by the caller + - input: The input tensor to be normalized with shape (M, N) and dtype in (fp32, fp16 or bf16) + - yscale: Output scale tensor with shape (M,) and dtype fp32. Allocated by the caller + - weight: The learnable weights tensor with shape (N, ). + - bias: Bias added to the result of layer norm with shape (N,) + - eps: A value added to the denominator for numerical stability. + + Returns: + - out: The output tensor with shape (M, N). + - yscale: Output scale tensor with shape (M,). 
Allocated by the caller + """ + M, N = input.shape + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // input.element_size() + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + + xscale = None + IS_SMOOTH = False + DTYPE_MAX = get_dtype_max(out.dtype) + + # Auxiliary tensor to store the RMSNorm output as fp32 before applying the quantization when using the blocked approach + aux = torch.empty(M, N, dtype=torch.float32, device=input.device) + + _quant_layernorm_kernel[(M,)]( + input, + out, + weight, + bias, + xscale, + yscale, + aux, + input.stride(0), + out.stride(0), + aux.stride(0), + M, + N, + epsilon, + DTYPE_MAX, + IS_SMOOTH, + BLOCK_SIZE, + ) + + return + + +def layernorm2d_fwd_with_smoothquant( + out: torch.Tensor, + input: torch.Tensor, + xscale: torch.Tensor, + yscale: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + epsilon: float = 1e-5, + x_bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + Applies Layer Normalization and then quantizes the output + + Key parameters: + - input: The input tensor to be normalized with shape (M, N). + - xscale: Input scale tensor which is multiplied with the output of layer normalization before quantization. + - yscale: Output scale tensor with shape (M,) and dtype fp32. Allocated by the caller + - weight: The learnable weights tensor with shape (N, ). + - bias: Bias added to the result of layer norm with shape (N,) + - eps: A value added to the denominator for numerical stability. + + Returns: + - Output: The output tensor with shape (M, N). 
+ """ + M, N = input.shape + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // input.element_size() + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + + IS_SMOOTH = True + DTYPE_MAX = get_dtype_max(out.dtype) + + # Auxiliary tensor to store the RMSNorm output as fp32 before applying the quantization when using the blocked approach + aux = torch.empty(M, N, dtype=torch.float32, device=input.device) + + _quant_layernorm_kernel[(M,)]( + input, + out, + weight, + bias, + xscale, + yscale, + aux, + input.stride(0), + out.stride(0), + aux.stride(0), + M, + N, + epsilon, + DTYPE_MAX, + IS_SMOOTH, + BLOCK_SIZE, + ) + + return + + +def layernorm2d_fwd_with_add_dynamicquant( + out: torch.Tensor, + input: torch.Tensor, + residual_in: torch.Tensor, + residual_out: torch.Tensor, + yscale: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + epsilon: float = 1e-5, + x_bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + Adds two input toegether, then does layer Normalization before quantizing the final output + + Key parameters: + - out: The output of layer normalization with shape (M, N). Allocated by the caller + - input: The input tensor to be normalized with shape (M, N) and dtype in (fp32, fp16 or bf16) + - residual_in: Tensor added to the input and same shape as input (M, N) + - residual_out: Output tensor that is input + residual_in with shape (M, N). Must be allocated by the caller + - yscale: Output scale tensor with shape (M,) and dtype fp32. Allocated by the caller + - weight: The learnable weights tensor with shape (N, ). + - bias: Bias added to the result of layer norm with shape (N,) + - eps: A value added to the denominator for numerical stability. + + Returns: + - out: The output tensor with shape (M, N). + - yscale: Output scale tensor with shape (M,). 
Allocated by the caller + """ + M, N = input.shape + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // input.element_size() + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + + xscale = None + IS_SMOOTH = False + DTYPE_MAX = get_dtype_max(out.dtype) + + # Auxiliary tensor to store the RMSNorm output as fp32 before applying the quantization when using the blocked approach + aux = torch.empty(M, N, dtype=torch.float32, device=input.device) + + _quant_fused_add_layernorm_kernel[(M,)]( + input, + out, + residual_in, + residual_out, + weight, + bias, + xscale, + yscale, + aux, + input.stride(0), + out.stride(0), + aux.stride(0), + M, + N, + epsilon, + DTYPE_MAX, + IS_SMOOTH, + BLOCK_SIZE, + ) + + return + + +def layernorm2d_fwd_with_add_smoothquant( + out: torch.Tensor, + input: torch.Tensor, + residual_in: torch.Tensor, + residual_out: torch.Tensor, + xscale: torch.Tensor, + yscale: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + epsilon: float = 1e-5, + x_bias: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + Applies Layer Normalization and then quantizes the output + + Key parameters: + - input: The input tensor to be normalized with shape (M, N). + - residual_in: Tensor added to the input and same shape as input (M, N) + - residual_out: Output tensor that is input + residual_in with shape (M, N). Must be allocated by the caller + - xscale: Input scale tensor which is multiplied with the output of layer normalization before quantization. + - yscale: Output scale tensor with shape (M,) and dtype fp32. Allocated by the caller + - weight: The learnable weights tensor with shape (N, ). + - bias: Bias added to the result of layer norm with shape (N,) + - eps: A value added to the denominator for numerical stability. + + Returns: + - Output: The output tensor with shape (M, N). + - yscale: Output scale tensor with shape (M,). 
Allocated by the caller + """ + + M, N = input.shape + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // input.element_size() + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + + IS_SMOOTH = True + DTYPE_MAX = get_dtype_max(out.dtype) + + # Auxiliary tensor to store the RMSNorm output as fp32 before applying the quantization when using the blocked approach + aux = torch.empty(M, N, dtype=torch.float32, device=input.device) + + _quant_fused_add_layernorm_kernel[(M,)]( + input, + out, + residual_in, + residual_out, + weight, + bias, + xscale, + yscale, + aux, + input.stride(0), + out.stride(0), + aux.stride(0), + M, + N, + epsilon, + DTYPE_MAX, + IS_SMOOTH, + BLOCK_SIZE, + ) + + return diff --git a/aiter/ops/triton/pa_decode.py b/aiter/ops/triton/pa_decode.py new file mode 100644 index 0000000000000000000000000000000000000000..6d807c345194ce97e9808749bd072944a99258ba --- /dev/null +++ b/aiter/ops/triton/pa_decode.py @@ -0,0 +1,2284 @@ +# SPDX-License-Identifier: MIT + +import math +from typing import Optional + +import triton +import triton.language as tl +import torch + +# This code is derived from sglang and FLASHNN projects +# https://github.com/AlibabaPAI/FLASHNN/blob/main/flashnn/triton_kernels/paged_attn.py + +_SEQ_PARTITION_SIZE = 1024 # HIP + + +def paged_attention_decode( + output: torch.Tensor, # [num_seqs, num_kv_heads*query_grp_sz, head_sz] + query: torch.Tensor, # [num_seqs, num_kv_heads*query_grp_sz, head_sz] + key_cache: torch.Tensor, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + value_cache: torch.Tensor, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + seq_lens: torch.Tensor, # [num_seqs] + block_tables: torch.Tensor, # [num_seqs, max_num_blks_per_seq] + attn_scale: float, + max_seq_len: int, + compute_type, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + num_seq_partitions: int = 0, # TODO use this below + alibi_slopes: torch.Tensor = None, +) -> None: + """ + #TODO: Add Doc + """ + + # get 
def paged_attn_decode_v1(
    output: torch.Tensor,  # [num_seqs, num_kv_heads*query_grp_sz, head_sz]
    query: torch.Tensor,  # [num_seqs, num_kv_heads*query_grp_sz, head_sz]
    key_cache: torch.Tensor,  # [num_blks, num_kv_heads, kv_blk_sz, head_sz]
    value_cache: torch.Tensor,  # [num_blks, num_kv_heads, kv_blk_sz, head_sz]
    block_tables: torch.Tensor,  # [num_seqs, max_num_blks_per_seq]
    seq_lens: torch.Tensor,  # [num_seqs]
    max_seq_len: int,
    compute_type,
    num_kv_heads: int,
    scale: float,
    alibi_slopes: Optional[torch.Tensor],
    k_scale: float,
    v_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
    blocksparse_block_size: int = 64,
    blocksparse_head_sliding_step: int = 0,
):
    """
    Launch the single-pass (unpartitioned) paged-attention decode kernel.

    The query group size is `query.shape[1] // num_kv_heads`. A group size of 1
    (MHA) launches `_paged_attn_decode_v1_wo_dot_kernel` with one program per
    (query head, sequence); any other group size (GQA) launches
    `_paged_attn_decode_v1_w_dot_kernel` with one program per
    (sequence, kv head).

    Key parameters:
    - output: Destination tensor, written by the kernel.
    - query, key_cache, value_cache, block_tables, seq_lens: Laid out as noted
      in the signature comments.
    - max_seq_len: Forwarded to the MHA kernel as `MAX_SEQ_LEN_POW2`; not used
      on the GQA path.
    - compute_type: Triton dtype the kernels cast q, k and v to.
    - scale: Multiplier applied to the query inside the kernel.
    - alibi_slopes: Optional per-query-head slopes; None disables ALiBi.
    - k_scale, v_scale: Multipliers applied to k / v when the cache is fp8.
    - tp_rank, blocksparse_*: Accepted but not used by this function.

    Returns:
    - None. The result is written into `output`.
    """

    num_seqs = query.shape[0]
    num_q_heads = query.shape[1]
    kv_blk_sz = key_cache.shape[2]
    head_sz = key_cache.shape[3]
    query_grp_sz = num_q_heads // num_kv_heads
    # tl.arange needs power-of-two extents; the kernels mask back down to the
    # real sizes.
    kv_blk_sz_pow2 = triton.next_power_of_2(kv_blk_sz)
    head_sz_pow2 = triton.next_power_of_2(head_sz)

    # MHA- Multi-Head Attention
    if query_grp_sz == 1:
        grid = (num_q_heads, num_seqs, 1)
        _paged_attn_decode_v1_wo_dot_kernel[grid](
            output,
            query,
            key_cache,
            value_cache,
            block_tables,
            seq_lens,
            alibi_slopes,
            scale,
            k_scale,
            v_scale,
            query.stride(0),
            query.stride(1),
            output.stride(0),
            output.stride(1),
            output.stride(2),
            key_cache.stride(0),
            key_cache.stride(1),
            key_cache.stride(2),
            block_tables.stride(0),
            compute_type=compute_type,
            KV_BLK_SZ=kv_blk_sz,
            KV_BLK_SZ_POW2=kv_blk_sz_pow2,
            HEAD_SZ=head_sz,
            HEAD_SZ_POW2=head_sz_pow2,
            QUERY_GRP_SZ=query_grp_sz,
            MAX_SEQ_LEN_POW2=max_seq_len,
        )
    # GQA - Grouped Query Attention
    else:
        grid = (num_seqs, num_kv_heads, 1)
        # The query group is padded to at least 16 rows for the tl.dot path.
        query_grp_sz_pow2 = max(16, triton.next_power_of_2(query_grp_sz))
        _paged_attn_decode_v1_w_dot_kernel[grid](
            output,
            query,
            key_cache,
            value_cache,
            block_tables,
            seq_lens,
            alibi_slopes,
            scale,
            k_scale,
            v_scale,
            output.stride(0),
            output.stride(1),
            output.stride(2),
            query.stride(0),
            query.stride(1),
            query.stride(2),
            key_cache.stride(0),
            key_cache.stride(1),
            key_cache.stride(2),
            key_cache.stride(3),
            block_tables.stride(0),
            block_tables.stride(1),
            compute_type=compute_type,
            HEAD_SZ=head_sz,
            HEAD_SZ_POW2=head_sz_pow2,
            QUERY_GRP_SZ=query_grp_sz,
            QUERY_GRP_SZ_POW2=query_grp_sz_pow2,
            KV_BLK_SZ=kv_blk_sz,
            # Pass the padded block size (previously the raw kv_blk_sz): the
            # kernel feeds this to tl.arange and masks with KV_BLK_SZ.
            KV_BLK_SZ_POW2=kv_blk_sz_pow2,
        )
[num_seqs, num_kv_heads * query_grp_sz, head_sz] + q_ptr, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + k_cache_ptr, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + v_cache_ptr, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + blk_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_q_heads] + scale, + k_scale, + v_scale, + stride_q_s, + stride_q_h, + stride_o_s, + stride_o_nh, + stride_o_hs, + stride_k_b, + stride_k_nh, + stride_k_kb, + stride_bt_s, + compute_type: tl.constexpr, + KV_BLK_SZ: tl.constexpr, + KV_BLK_SZ_POW2: tl.constexpr, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + QUERY_GRP_SZ: tl.constexpr, + MAX_SEQ_LEN_POW2: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + head_idx = tl.program_id(axis=0) + seq_idx = tl.program_id(axis=1) + kv_head_idx = head_idx // QUERY_GRP_SZ + + log2e: tl.constexpr = 1.4426950408889634 + + seq_len = tl.load(seq_lens_ptr + seq_idx) + + num_kv_blks = tl.cdiv(seq_len, KV_BLK_SZ) + + blk_offs = tl.arange(0, KV_BLK_SZ_POW2) + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + + # load alibi slopes [1] + if alibi_slopes_ptr is None: + alibi_slope = 0.0 + else: + alibi_slope = tl.load(alibi_slopes_ptr + head_idx) + + # load q [1, HEAD_SZ_POW2] + q_offs = seq_idx * stride_q_s + head_idx * stride_q_h + head_sz_offs + q = tl.load(q_ptr + q_offs, mask=head_sz_offs < HEAD_SZ) + q = (q * scale).to(compute_type) + + acc = tl.zeros([KV_BLK_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = float("-inf") + exp_sum = 0.0 + + kv_offs = ( + kv_head_idx * stride_k_nh + + blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] + ) + blk_tbl_start_ptr = blk_tables_ptr + seq_idx * stride_bt_s + + for b in range(num_kv_blks): + kv_blk_nums = tl.load(blk_tbl_start_ptr + b) + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = b * KV_BLK_SZ + blk_offs + kv_mask = ( + (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] 
< HEAD_SZ) + ) + + # load k [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_0 = tl.load(k_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + if k_0.dtype.is_fp8(): + k = k_0.to(tl.float32) * k_scale + else: + k = k_0 + k = k.to(compute_type) + + # qk #[KV_BLK_SZ_POW2] + qk = tl.sum( + (q[None, :] * k).to(tl.float32), axis=1 + ) # [1, HEAD_SZ_POW2] * [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + + if alibi_slopes_ptr is not None: + qk += (alibi_slope * (blk_seq_offs - seq_len + 1)).to(tl.float32) + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + + max_logit_new = tl.maximum(tl.max(qk, axis=0), max_logit) + + # p: [KV_BLK_SZ_POW2] + p = tl.math.exp2((qk - max_logit_new) * log2e) + alpha = tl.math.exp2((max_logit - max_logit_new) * log2e) + acc *= alpha[:, None] + + # load v [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + v_0 = tl.load(v_cache_ptr + kv_blk_offs, mask=kv_mask) + if v_0.dtype.is_fp8(): + v = v_0.to(tl.float32) * v_scale + else: + v = v_0 + v = v.to(compute_type) + + acc += p[:, None] * v + + exp_sum = exp_sum * alpha + tl.sum(p, axis=0) + max_logit = max_logit_new + + acc = acc / exp_sum + + offs_out = seq_idx * stride_o_s + head_idx * stride_o_nh + head_sz_offs + out_mask = head_sz_offs < HEAD_SZ + tl.store( + out + offs_out, tl.sum(acc, axis=0).to(out.dtype.element_ty), mask=out_mask + ) + + +@triton.jit +def _paged_attn_decode_v1_w_dot_kernel( + out_ptr, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + q_ptr, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + k_cache_ptr, # [num_blocks, num_kv_heads, kv_blk_sz, head_sz] + v_cache_ptr, # [num_blocks, num_kv_heads, kv_blk_sz, head_sz] + blk_tables_ptr, # [num_seqs, max_num_blks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes, # [num_kv_heads*query_grp_sz] + scale, + k_scale, + v_scale, + stride_o_s, + stride_o_nh, + stride_o_hs, + stride_q_s, + stride_q_nh, + stride_q_hs, + stride_k_b, + stride_k_nh, + stride_k_kb, + stride_k_hs, + stride_bt_s, + stride_bt_nb, + 
compute_type: tl.constexpr, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + QUERY_GRP_SZ: tl.constexpr, + QUERY_GRP_SZ_POW2: tl.constexpr, + KV_BLK_SZ: tl.constexpr, + KV_BLK_SZ_POW2: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + seq_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + + log2e: tl.constexpr = 1.4426950408889634 + + seq_len = tl.load(seq_lens_ptr + seq_idx) + + num_kv_blks = tl.cdiv(seq_len, KV_BLK_SZ) + + blk_offs = tl.arange(0, KV_BLK_SZ_POW2) + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + q_grp_offs = tl.arange(0, QUERY_GRP_SZ_POW2) + + # load alibi slopes[QUERY_GRP_SZ_POW2] + if alibi_slopes is None: + alibi_slope = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + else: + alibi_slope = tl.load( + alibi_slopes + kv_head_idx * QUERY_GRP_SZ + q_grp_offs, + mask=q_grp_offs < QUERY_GRP_SZ, + other=0.0, + ) + + q_offs = ( + seq_idx * stride_q_s + + (kv_head_idx * QUERY_GRP_SZ + q_grp_offs[:, None]) * stride_q_nh + + head_sz_offs[None, :] * stride_q_hs + ) + + # load q[QUERY_GRP_SZ_POW2, HEAD_SZ_POW2] + q_mask = (q_grp_offs[:, None] < QUERY_GRP_SZ) & (head_sz_offs[None, :] < HEAD_SZ) + + q = tl.load(q_ptr + q_offs, mask=q_mask, other=0.0) + q = (q * scale).to(compute_type) + + acc = tl.zeros([QUERY_GRP_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + float("-inf") + exp_sum = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + + kv_offs = ( + kv_head_idx * stride_k_nh + + blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] * stride_k_hs + ) + blk_tbl_start_ptr = blk_tables_ptr + seq_idx * stride_bt_s + + for b in range(num_kv_blks): + kv_blk_nums = tl.load(blk_tbl_start_ptr + b) + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = b * KV_BLK_SZ + blk_offs + kv_mask = ( + (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] < HEAD_SZ) + ) + + # load k[KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_0 = tl.load(k_cache_ptr + 
def paged_attn_decode_v2(
    output: torch.Tensor,  # [num_seqs, num_kv_heads*query_grp_sz, head_sz],
    query: torch.Tensor,  # [num_seqs, num_kv_heads*query_grp_sz, head_sz],
    key_cache: torch.Tensor,  # [num_blks, num_kv_heads, kv_blk_sz, head_sz] ,
    value_cache: torch.Tensor,  # [num_blks, num_kv_heads, kv_blk_sz, head_sz] ,
    block_tables: torch.Tensor,  # [num_seqs, max_num_blks_per_seq],
    seq_lens: torch.Tensor,  # [num_seqs],
    max_seq_len: int,
    compute_type,
    num_kv_heads: int,
    scale: float,
    alibi_slopes: Optional[torch.Tensor],
    k_scale: float,
    v_scale: float,
    max_num_partitions: int,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
    blocksparse_block_size: int = 64,
    blocksparse_head_sliding_step: int = 0,
):
    """
    Launch the partitioned (two-stage) paged-attention decode kernels.

    Stage one launches one program per sequence partition and fills the
    temporary `exp_sums`, `max_logits` and `tmp_output` buffers allocated here.
    Stage two launches a reduce kernel that receives those buffers together
    with `output`. A query group size of 1 (MHA) uses the `wo_dot` kernel
    pair; any other group size (GQA) uses the `w_dot` pair.

    Key parameters:
    - output: Destination tensor passed to the reduce kernel.
    - query, key_cache, value_cache, block_tables, seq_lens: Laid out as noted
      in the signature comments.
    - max_seq_len: Forwarded to the MHA kernel as `MAX_SEQ_LEN_POW2`; not used
      on the GQA path.
    - compute_type: Forwarded to the stage-one kernels.
    - scale, k_scale, v_scale, alibi_slopes: Forwarded to the stage-one kernels.
    - max_num_partitions: Number of sequence partitions; sizes the launch grid
      and the temporary buffers.
    - tp_rank, blocksparse_*: Accepted but not used by this function.

    Returns:
    - None.
    """

    num_seqs = query.shape[0]
    num_q_heads = query.shape[1]
    kv_blk_sz = key_cache.shape[2]
    head_sz = key_cache.shape[3]
    query_grp_sz = num_q_heads // num_kv_heads

    # Note: There is a bug in triton.next_power_of_2 function which causes it
    # to update the passed in arg, so that's why we have a workaround here
    # max_num_partitions_pow2 = triton.next_power_of_2(max_num_partitions)
    # Both reduce launches below must use this value and never call
    # triton.next_power_of_2 on max_num_partitions directly.
    if max_num_partitions == 0:
        max_num_partitions_pow2 = 1
    else:
        max_num_partitions_pow2 = 2 ** math.ceil(math.log2(max_num_partitions))

    kv_blk_sz_pow2 = triton.next_power_of_2(kv_blk_sz)
    head_sz_pow2 = triton.next_power_of_2(head_sz)

    # MHA
    if query_grp_sz == 1:
        grid = (num_q_heads, num_seqs, max_num_partitions)
        shape_info = (num_seqs, num_q_heads, max_num_partitions)
        exp_sums = torch.empty(
            size=shape_info, dtype=torch.float32, device=output.device
        )
        max_logits = torch.empty(
            size=shape_info, dtype=torch.float32, device=output.device
        )
        tmp_output = torch.empty(
            (*shape_info, head_sz), dtype=output.dtype, device=output.device
        )
        _paged_attn_decode_v2_wo_dot_kernel[grid](
            exp_sums,
            max_logits,
            tmp_output,
            query,
            key_cache,
            value_cache,
            block_tables,
            seq_lens,
            scale,
            k_scale,
            v_scale,
            alibi_slopes,
            exp_sums.stride(0),
            exp_sums.stride(1),
            tmp_output.stride(0),
            tmp_output.stride(1),
            tmp_output.stride(2),
            query.stride(0),
            query.stride(1),
            key_cache.stride(0),
            key_cache.stride(1),
            key_cache.stride(2),
            block_tables.stride(0),
            block_tables.stride(1),
            compute_type=compute_type,
            KV_BLK_SZ=kv_blk_sz,
            KV_BLK_SZ_POW2=kv_blk_sz_pow2,
            HEAD_SZ=head_sz,
            HEAD_SZ_POW2=head_sz_pow2,
            QUERY_GRP_SZ=query_grp_sz,
            SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE,
            MAX_NUM_BLKS_PER_SEQ=block_tables.shape[1],
            MAX_SEQ_LEN_POW2=max_seq_len,
        )
        grid = (num_q_heads, num_seqs, 1)
        _paged_attn_decode_v2_wo_dot_reduce_kernel[grid](
            output,
            exp_sums,
            max_logits,
            tmp_output,
            seq_lens,
            output.stride(0),
            output.stride(1),
            exp_sums.stride(0),
            exp_sums.stride(1),
            tmp_output.stride(0),
            tmp_output.stride(1),
            tmp_output.stride(2),
            HEAD_SZ=head_sz,
            HEAD_SZ_POW2=head_sz_pow2,
            SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE,
            MAX_NUM_SEQ_PARTITIONS=int(max_num_partitions),
            MAX_NUM_SEQ_PARTITIONS_POW2=int(max_num_partitions_pow2),
        )
    # GQA
    else:
        grid = (num_seqs, num_kv_heads, max_num_partitions)
        shape_info = (num_seqs, num_kv_heads, max_num_partitions, query_grp_sz)
        max_logits = torch.empty(shape_info, dtype=torch.float32, device=output.device)
        exp_sums = torch.empty(shape_info, dtype=torch.float32, device=output.device)
        tmp_output = torch.empty(
            *shape_info, head_sz, dtype=output.dtype, device=output.device
        )
        # The query group is padded to at least 16 rows for the tl.dot path.
        query_grp_sz_pow2 = max(16, triton.next_power_of_2(query_grp_sz))
        _paged_attn_decode_v2_w_dot_kernel[grid](
            exp_sums,
            max_logits,
            tmp_output,
            query,
            key_cache,
            value_cache,
            block_tables,
            seq_lens,
            scale,
            k_scale,
            v_scale,
            alibi_slopes,
            exp_sums.stride(0),
            exp_sums.stride(1),
            exp_sums.stride(2),
            tmp_output.stride(0),
            tmp_output.stride(1),
            tmp_output.stride(2),
            tmp_output.stride(3),
            query.stride(0),
            query.stride(1),
            key_cache.stride(0),
            key_cache.stride(1),
            key_cache.stride(2),
            block_tables.stride(0),
            compute_type=compute_type,
            HEAD_SZ=head_sz,
            HEAD_SZ_POW2=head_sz_pow2,
            QUERY_GRP_SZ=query_grp_sz,
            QUERY_GRP_SZ_POW2=query_grp_sz_pow2,
            KV_BLK_SZ=kv_blk_sz,
            KV_BLK_SZ_POW2=kv_blk_sz_pow2,
            SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE,
        )
        grid = (num_seqs, num_kv_heads, 1)
        _paged_attn_decode_v2_w_dot_reduce_kernel[grid](
            output,
            exp_sums,
            max_logits,
            tmp_output,
            seq_lens,
            output.stride(0),
            output.stride(1),
            exp_sums.stride(0),
            exp_sums.stride(1),
            exp_sums.stride(2),
            tmp_output.stride(0),
            tmp_output.stride(1),
            tmp_output.stride(2),
            tmp_output.stride(3),
            HEAD_SZ=head_sz,
            HEAD_SZ_POW2=head_sz_pow2,
            QUERY_GRP_SZ=query_grp_sz,
            QUERY_GRP_SZ_POW2=query_grp_sz_pow2,
            SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE,
            MAX_NUM_SEQ_PARTITIONS=int(max_num_partitions),
            # Reuse the workaround value (previously a direct
            # triton.next_power_of_2 call) so both paths agree, including the
            # max_num_partitions == 0 case.
            MAX_NUM_SEQ_PARTITIONS_POW2=int(max_num_partitions_pow2),
        )
0.0 + else: + alibi_slope = tl.load(alibi_slopes + head_idx) + + # load q[HEAD_SZ] + q_offs = seq_idx * stride_q_s + head_idx * stride_q_h + head_sz_offs + q = tl.load(q_ptr + q_offs, mask=head_sz_offs < HEAD_SZ) + q = (q * scale).to(compute_type) + + acc = tl.zeros([KV_BLK_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = float("-inf") + exp_sum = 0.0 + + kv_offs = ( + kv_head_idx * stride_k_nh + + blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] + ) + kv_blk_start = seq_part_idx * (SEQ_PARTITION_SZ // KV_BLK_SZ) + blk_tables_start_ptr = blk_tables_ptr + seq_idx * stride_bt_s + + for b in range(num_kv_blks): + kv_blk_idx = kv_blk_start + b + kv_blk_nums = tl.load(blk_tables_start_ptr + kv_blk_idx * stride_bt_nb) + + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = kv_blk_idx * KV_BLK_SZ + blk_offs + kv_mask = ( + (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] < HEAD_SZ) + ) + + # load k[KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_0 = tl.load(k_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + k = k_0.to(tl.float32) * k_scale if k_0.dtype.is_fp8() else k_0 + k = k.to(compute_type) + + # qk: [KV_BLK_SZ_POW2] + qk = tl.sum((q[None, :] * k).to(tl.float32), axis=1) + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + + if alibi_slopes is not None: + qk += (alibi_slope * (blk_seq_offs - seq_len + 1)).to(tl.float32) + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + + max_logit_new = tl.maximum(max_logit, tl.max(qk, axis=0)) + + # p: [KV_BLK_SZ_POW2] + p = tl.math.exp2((qk - max_logit_new) * log2e) + alpha = tl.math.exp2((max_logit - max_logit_new) * log2e) + acc *= alpha[:, None] + + # v: [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + v_0 = tl.load(v_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + v = v_0.to(tl.float32) * v_scale if v_0.dtype.is_fp8() else v_0 + v = v.to(compute_type) + + # acc: [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + acc += p[:, None] * v + + exp_sum = exp_sum * alpha + 
tl.sum(p, axis=0) + max_logit = max_logit_new + + acc = acc / exp_sum + + max_logits_offs = seq_idx * stride_exp_s + head_idx * stride_exp_h + seq_part_idx + + tl.store(max_logits_ptr + max_logits_offs, max_logit) + tl.store(exp_sums_ptr + max_logits_offs, exp_sum) + + logits_offs = ( + seq_idx * stride_logits_s + + head_idx * stride_logits_h + + seq_part_idx * stride_logits_p + + head_sz_offs + ) + logits_mask = head_sz_offs < HEAD_SZ + tl.store( + logits_ptr + logits_offs, + tl.sum(acc, axis=0).to(logits_ptr.dtype.element_ty), + mask=logits_mask, + ) + + +@triton.jit +def _paged_attn_decode_v2_wo_dot_reduce_kernel( + out, + exp_sums_ptr, + max_logits_ptr, + logits_ptr, + seq_lens, + stride_out_n, + stride_out_h, + stride_exp_sums_n, + stride_exp_sums_h, + stride_logits_n, + stride_logits_h, + stride_logits_b, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + SEQ_PARTITION_SZ: tl.constexpr, + MAX_NUM_SEQ_PARTITIONS: tl.constexpr, + MAX_NUM_SEQ_PARTITIONS_POW2: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + # get seq_idx, head_idx, seq_len + head_idx = tl.program_id(axis=0) + seq_idx = tl.program_id(axis=1) + + seq_len = tl.load(seq_lens + seq_idx) + num_partitions = tl.cdiv(seq_len, SEQ_PARTITION_SZ) + + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + seq_part_offs = tl.arange(0, MAX_NUM_SEQ_PARTITIONS_POW2) + + max_logit = float("-inf") + acc = tl.zeros([HEAD_SZ_POW2], dtype=tl.float32) + global_exp_sum = tl.zeros([1], dtype=tl.float32) + + # load max_logits [MAX_NUM_SEQ_PARTITIONS_POW2] + max_logits_offs = ( + seq_idx * stride_exp_sums_n + head_idx * stride_exp_sums_h + seq_part_offs + ) + max_logits_mask = seq_part_offs < num_partitions + max_logits = tl.load( + max_logits_ptr + max_logits_offs, + mask=max_logits_mask, + other=float("-inf"), + ) + + # find max_logit + max_logit = tl.max(max_logits, axis=0) + + # load exp_sum [MAX_NUM_SEQ_PARTITIONS_POW2] + exp_sums_offs = ( + seq_idx * stride_exp_sums_n + head_idx * stride_exp_sums_h + seq_part_offs + ) 
+ exp_sums_mask = seq_part_offs < num_partitions + exp_sums = tl.load( + exp_sums_ptr + exp_sums_offs, + mask=exp_sums_mask, + other=0.0, + ) + + # rescaled_exp_sum and global_exp_sum + # [MAX_NUM_SEQ_PARTITIONS_POW2] + rescaled_exp_sum = exp_sums * tl.exp(max_logits - max_logit) + global_exp_sum += tl.sum(rescaled_exp_sum, axis=0) + rescaled_exp_sum /= global_exp_sum + + # load logits + logits_offs = ( + seq_idx * stride_logits_n + + head_idx * stride_logits_h + + seq_part_offs[:, None] * stride_logits_b + + head_sz_offs[None, :] + ) + logits_mask = (seq_part_offs[:, None] < num_partitions) & ( + head_sz_offs[None, :] < HEAD_SZ + ) + + logits = tl.load(logits_ptr + logits_offs, mask=logits_mask, other=0.0) + acc += tl.sum(logits * rescaled_exp_sum[:, None], axis=0) + + # store the final output + out_ptr = seq_idx * stride_out_n + head_idx * stride_out_h + head_sz_offs + out_mask = head_sz_offs < HEAD_SZ + tl.store(out + out_ptr, acc.to(out.dtype.element_ty), mask=out_mask) + + +@triton.jit +def _paged_attn_decode_v2_w_dot_kernel( + exp_sums_ptr, # [num_seqs, num_kv_heads, max_parts, q_grp_sz] + max_logits_ptr, # [num_seqs, num_kv_heads, max_parts, q_grp_sz] + logits_ptr, # [num_seqs, num_kv_heads, max_parts, q_grp_sz, head_sz] + q_ptr, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + k_cache_ptr, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + v_cache_ptr, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + blk_tables_ptrs, # [num_seqs, max_num_blks_per_seq] + seq_lens_ptr, # [num_seqs] + scale, + k_scale, + v_scale, + alibi_slopes, + stride_max_logits_s, + stride_max_logits_nh, + stride_max_logits_p, + stride_logits_s, + stride_logits_nh, + stride_logits_p, + stride_logits_g, + stride_q_s, + stride_q_nh, + stride_k_b, + stride_k_nh, + stride_k_kb, + stride_bt_s, + compute_type: tl.constexpr, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + QUERY_GRP_SZ: tl.constexpr, + QUERY_GRP_SZ_POW2: tl.constexpr, + KV_BLK_SZ: tl.constexpr, + KV_BLK_SZ_POW2: 
tl.constexpr, + SEQ_PARTITION_SZ: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + seq_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + seq_part_idx = tl.program_id(2) + + log2e: tl.constexpr = 1.4426950408889634 + + seq_len = tl.load(seq_lens_ptr + seq_idx) + + if seq_part_idx * SEQ_PARTITION_SZ >= seq_len: + return + + seq_start_idx = seq_part_idx * SEQ_PARTITION_SZ + if seq_start_idx >= seq_len: + return + + seq_end_idx = tl.minimum(seq_start_idx + SEQ_PARTITION_SZ, seq_len) + + num_kv_blks = tl.cdiv(seq_end_idx - seq_start_idx, KV_BLK_SZ) + + blk_offs = tl.arange(0, KV_BLK_SZ_POW2) + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + q_grp_offs = tl.arange(0, QUERY_GRP_SZ_POW2) + + # load alibi slopes[QUERY_GRP_SZ_POW2] + if alibi_slopes is None: + alibi_slope = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + else: + alibi_slope = tl.load( + alibi_slopes + kv_head_idx * QUERY_GRP_SZ + q_grp_offs, + mask=q_grp_offs < QUERY_GRP_SZ, + other=0.0, + ) + + # load q[QUERY_GRP_SZ_POW2, HEAD_SZ_POW2] + q_offs = ( + seq_idx * stride_q_s + + (kv_head_idx * QUERY_GRP_SZ + q_grp_offs[:, None]) * stride_q_nh + + head_sz_offs[None, :] + ) + q_mask = (q_grp_offs[:, None] < QUERY_GRP_SZ) & (head_sz_offs[None, :] < HEAD_SZ) + q = tl.load(q_ptr + q_offs, mask=q_mask, other=0.0) + q = (q * scale).to(compute_type) + + acc = tl.zeros([QUERY_GRP_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + float("-inf") + exp_sum = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + + kv_offs = ( + kv_head_idx * stride_k_nh + + blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] + ) + kv_blk_start = seq_part_idx * (SEQ_PARTITION_SZ // KV_BLK_SZ) + blk_tables_start_ptr = blk_tables_ptrs + seq_idx * stride_bt_s + for b in range(num_kv_blks): + kv_blk_idx = kv_blk_start + b + kv_blk_nums = tl.load(blk_tables_start_ptr + kv_blk_idx) + + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = kv_blk_idx * KV_BLK_SZ + blk_offs 
+ kv_mask = ( + (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] < HEAD_SZ) + ) + + # load k[KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_0 = tl.load(k_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + k = k_0.to(tl.float32) * k_scale if k_0.dtype.is_fp8() else k_0 + k = k.to(compute_type) + + # qk: [QUERY_GRP_SZ_POW2, KV_BLK_SZ_POW2] + qk = tl.dot(q, k.T, out_dtype=tl.float32) + qk = tl.where( + (q_grp_offs[:, None] < QUERY_GRP_SZ) & (blk_seq_offs[None, :] < seq_len), + qk, + float("-inf"), + ) + + if alibi_slopes is not None: + qk += (alibi_slope[:, None] * (blk_seq_offs - seq_len + 1)[None, :]).to( + tl.float32 + ) + qk = tl.where( + (q_grp_offs[:, None] < QUERY_GRP_SZ) & (blk_seq_offs[None, :] < seq_len), + qk, + float("-inf"), + ) + + max_logit_new = tl.maximum(max_logit, tl.max(qk, axis=1)) + + # p: [QUERY_GRP_SZ_POW2, KV_BLK_SZ_POW2] + p = tl.math.exp2((qk - max_logit_new[:, None]) * log2e) + alpha = tl.math.exp2((max_logit - max_logit_new) * log2e) + acc *= alpha[:, None] + + # v: [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + v_0 = tl.load(v_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + v = v_0.to(tl.float32) * v_scale if v_0.dtype.is_fp8() else v_0 + v = v.to(compute_type) + + p = p.to(v.dtype) + acc += tl.dot(p, v, out_dtype=tl.float32) + + exp_sum = exp_sum * alpha + tl.sum(p, axis=1) + max_logit = max_logit_new + + acc = acc / exp_sum[:, None] + + max_logits_offs = ( + seq_idx * stride_max_logits_s + + kv_head_idx * stride_max_logits_nh + + seq_part_idx * stride_max_logits_p + + q_grp_offs + ) + m_grp_mask = q_grp_offs < QUERY_GRP_SZ + tl.store(max_logits_ptr + max_logits_offs, max_logit, mask=m_grp_mask) + tl.store(exp_sums_ptr + max_logits_offs, exp_sum, mask=m_grp_mask) + + logits_offs = seq_idx * stride_logits_s + logits_offs += kv_head_idx * stride_logits_nh + logits_offs += ( + seq_part_idx * stride_logits_p + + q_grp_offs[:, None] * stride_logits_g + + head_sz_offs[None, :] + ) + + tl.store(logits_ptr + 
logits_offs, acc, mask=q_mask) + + +@triton.jit +def _paged_attn_decode_v2_w_dot_reduce_kernel( + out_ptr, # [num_seqs, num_kv_heads, q_grp_sz, head_sz] + exp_sums_ptr, # [num_seqs, num_kv_heads, max_parts, q_grp_sz] + max_logits_ptr, # [num_seqs, num_kv_heads, max_parts, q_grp_sz] + logits_ptrs, # [num_seqs, num_kv_heads, max_parts, q_grp_sz, head_sz] + seq_lens_ptr, # [num_seqs] + stride_o_s, + stride_o_h, + stride_exp_sums_s, + stride_exp_sums_h, + stride_exp_sums_p, + stride_logits_s, + stride_logits_h, + stride_logits_p, + stride_logits_g, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + QUERY_GRP_SZ: tl.constexpr, + QUERY_GRP_SZ_POW2: tl.constexpr, + SEQ_PARTITION_SZ: tl.constexpr, + MAX_NUM_SEQ_PARTITIONS: tl.constexpr, + MAX_NUM_SEQ_PARTITIONS_POW2: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + seq_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + + seq_len = tl.load(seq_lens_ptr + seq_idx) + num_partitions = tl.cdiv(seq_len, SEQ_PARTITION_SZ) + + part_offs = tl.arange(0, MAX_NUM_SEQ_PARTITIONS_POW2) + q_grp_offs = tl.arange(0, QUERY_GRP_SZ_POW2) + head_offs = tl.arange(0, HEAD_SZ_POW2) + + # get global max logit + exp_sums_offs = ( + seq_idx * stride_exp_sums_s + + kv_head_idx * stride_exp_sums_h + + part_offs[:, None] * stride_exp_sums_p + + q_grp_offs[None, :] + ) + exp_sums_mask = (part_offs[:, None] < num_partitions) & ( + q_grp_offs[None, :] < QUERY_GRP_SZ + ) + + # max_logits: [MAX_NUM_SEQ_PARTITIONS_POW2, QUERY_GRP_SZ_POW2] + max_logits = tl.load( + max_logits_ptr + exp_sums_offs, mask=exp_sums_mask, other=float("-inf") + ) + # max_logit: [QUERY_GRP_SZ_POW2] + ml = tl.max(max_logits, axis=0) + + # Rescale the exp sums and compute the global sum + # exp_sums: [MAX_NUM_SEQ_PARTITIONS, QUERY_GRP_SZ_POW2] + exp_sums = tl.load(exp_sums_ptr + exp_sums_offs, mask=exp_sums_mask, other=0.0) + exp_sums *= tl.exp(max_logits - ml[None, :]) + + # exp_sum: [QUERY_GRP_SZ_POW2] + exp_sum = tl.sum(exp_sums, axis=0) + + # p: 
[MAX_NUM_SEQ_PARTITIONS_POW2, QUERY_GRP_SZ_POW2] + p = exp_sums / exp_sum[None, :] + p = tl.reshape(p, (MAX_NUM_SEQ_PARTITIONS_POW2, QUERY_GRP_SZ_POW2, 1)) + + # logits_offset + logits_offset = ( + seq_idx * stride_logits_s + + kv_head_idx * stride_logits_h + + part_offs[:, None, None] * stride_logits_p + + q_grp_offs[None, :, None] * stride_logits_g + + head_offs[None, None, :] + ) + # load logits + logits_mask = (part_offs[:, None] < num_partitions) & ( + q_grp_offs[None, :] < QUERY_GRP_SZ + ) + logits = tl.load( + logits_ptrs + logits_offset, mask=logits_mask[:, :, None], other=0.0 + ) + + # out: [QUERY_GRP_SZ_POW2, HEAD_SZ_POW2] + out = tl.sum((logits * p).to(tl.float32), axis=0).to(out_ptr.dtype.element_ty) + + # store output + out_offs = ( + seq_idx * stride_o_s + + (kv_head_idx * QUERY_GRP_SZ + q_grp_offs[:, None]) * stride_o_h + + head_offs[None, :] + ) + tl.store( + out_ptr + out_offs, + out, + mask=(q_grp_offs[:, None] < QUERY_GRP_SZ) & (head_offs[None, :] < HEAD_SZ), + ) + + +def paged_attn_decode_v1_per_token_quant( + output: torch.Tensor, # [num_seqs, num_kv_heads*query_grp_sz, head_sz] + query: torch.Tensor, # [num_seqs, num_kv_heads*query_grp_sz, head_sz] + key_cache: torch.Tensor, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + value_cache: torch.Tensor, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + block_tables: torch.Tensor, # [num_seqs, max_num_blks_per_seq] + seq_lens: torch.Tensor, # [num_seqs] + max_seq_len: int, + compute_type, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, +): + """ + #TODO: Add Doc + """ + + num_seqs = query.shape[0] + num_q_heads = query.shape[1] + kv_blk_sz = key_cache.shape[2] + head_sz = key_cache.shape[3] + query_grp_sz = query.shape[1] // num_kv_heads + 
query_grp_sz_pow2 = triton.next_power_of_2(query_grp_sz) + kv_blk_sz_pow2 = triton.next_power_of_2(kv_blk_sz) + head_sz_pow2 = triton.next_power_of_2(head_sz) + + # MHA- Multi-Head Attention + if query_grp_sz == 1: + grid = (num_q_heads, num_seqs, 1) + _paged_attn_decode_v1_wo_dot_kernel_per_token_quant[grid]( + output, + query, + key_cache, + value_cache, + block_tables, + seq_lens, + alibi_slopes, + scale, + k_scale, + v_scale, + query.stride(0), + query.stride(1), + output.stride(0), + output.stride(1), + output.stride(2), + key_cache.stride(0), + key_cache.stride(1), + key_cache.stride(2), + block_tables.stride(0), + k_scale.stride(0), + k_scale.stride(1), + k_scale.stride(2), + compute_type=compute_type, + KV_BLK_SZ=kv_blk_sz, + KV_BLK_SZ_POW2=kv_blk_sz_pow2, + HEAD_SZ=head_sz, + HEAD_SZ_POW2=head_sz_pow2, + QUERY_GRP_SZ=query_grp_sz, + MAX_SEQ_LEN_POW2=max_seq_len, + ) + # GQA - Grouped Query Attention + else: + grid = (num_seqs, num_kv_heads, 1) + if query_grp_sz <= 16: + query_grp_sz_pow2 = 16 + else: + query_grp_sz_pow2 = triton.next_power_of_2(query_grp_sz) + _paged_attn_decode_v1_w_dot_kernel_per_token_quant[grid]( + output, + query, + key_cache, + value_cache, + block_tables, + seq_lens, + alibi_slopes, + scale, + k_scale, + v_scale, + output.stride(0), + output.stride(1), + output.stride(2), + query.stride(0), + query.stride(1), + query.stride(2), + key_cache.stride(0), + key_cache.stride(1), + key_cache.stride(2), + key_cache.stride(3), + block_tables.stride(0), + block_tables.stride(1), + k_scale.stride(0), + k_scale.stride(1), + k_scale.stride(2), + compute_type=compute_type, + HEAD_SZ=head_sz, + HEAD_SZ_POW2=head_sz_pow2, + QUERY_GRP_SZ=query_grp_sz, + QUERY_GRP_SZ_POW2=query_grp_sz_pow2, + KV_BLK_SZ=kv_blk_sz, + KV_BLK_SZ_POW2=kv_blk_sz, + ) + + +@triton.jit +def _paged_attn_decode_v1_wo_dot_kernel_per_token_quant( + out, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + q_ptr, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + k_cache_ptr, # 
[num_blks, num_kv_heads, kv_blk_sz, head_sz] + v_cache_ptr, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] + blk_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_q_heads] + scale, + k_scale_ptr, # [num_blks, num_kv_heads, kv_blk_sz] + v_scale_ptr, # [num_blks, num_kv_heads, kv_blk_sz] + stride_q_s, + stride_q_h, + stride_o_s, + stride_o_nh, + stride_o_hs, + stride_k_b, + stride_k_nh, + stride_k_kb, + stride_bt_s, + stride_k_scale_b, + stride_k_scale_nh, + stride_k_scale_kb, + compute_type: tl.constexpr, + KV_BLK_SZ: tl.constexpr, + KV_BLK_SZ_POW2: tl.constexpr, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + QUERY_GRP_SZ: tl.constexpr, + MAX_SEQ_LEN_POW2: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + head_idx = tl.program_id(axis=0) + seq_idx = tl.program_id(axis=1) + kv_head_idx = head_idx // QUERY_GRP_SZ + + log2e: tl.constexpr = 1.4426950408889634 + + seq_len = tl.load(seq_lens_ptr + seq_idx) + + num_kv_blks = tl.cdiv(seq_len, KV_BLK_SZ) + + blk_offs = tl.arange(0, KV_BLK_SZ_POW2) + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + + # load alibi slopes [1] + if alibi_slopes_ptr is None: + alibi_slope = 0.0 + else: + alibi_slope = tl.load(alibi_slopes_ptr + head_idx) + + # load q [1, HEAD_SZ_POW2] + q_offs = seq_idx * stride_q_s + head_idx * stride_q_h + head_sz_offs + q = tl.load(q_ptr + q_offs, mask=head_sz_offs < HEAD_SZ) + q = (q * scale).to(compute_type) + + acc = tl.zeros([KV_BLK_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = float("-inf") + exp_sum = 0.0 + + kv_offs = ( + kv_head_idx * stride_k_nh + + blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] + ) + k_scale_offs = kv_head_idx * stride_k_scale_nh + blk_offs * stride_k_scale_kb + blk_tbl_start_ptr = blk_tables_ptr + seq_idx * stride_bt_s + + for b in range(num_kv_blks): + kv_blk_nums = tl.load(blk_tbl_start_ptr + b) + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = b * KV_BLK_SZ + blk_offs + kv_mask = ( 
+ (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] < HEAD_SZ) + ) + kv_scale_mask = (blk_seq_offs < seq_len) & (blk_offs < KV_BLK_SZ) + kv_scale_offs = kv_blk_nums * stride_k_scale_b + k_scale_offs + + # load k [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_scale = tl.load(k_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + k_0 = tl.load(k_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + if k_0.dtype.is_fp8(): + k = k_0.to(tl.float32) * k_scale[:, None] + else: + k = k_0 + k = k.to(compute_type) + + # qk #[KV_BLK_SZ_POW2] + qk = tl.sum((q[None, :] * k).to(tl.float32), axis=1) + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + if alibi_slopes_ptr is not None: + qk += (alibi_slope * (blk_seq_offs - seq_len + 1)).to(tl.float32) + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + + max_logit_new = tl.maximum(tl.max(qk, axis=0), max_logit) + + # p: [KV_BLK_SZ_POW2] + p = tl.math.exp2((qk - max_logit_new) * log2e) + alpha = tl.math.exp2((max_logit - max_logit_new) * log2e) + acc *= alpha[:, None] + + # load v [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + v_scale = tl.load(v_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + v_0 = tl.load(v_cache_ptr + kv_blk_offs, mask=kv_mask) + if v_0.dtype.is_fp8(): + v = v_0.to(tl.float32) * v_scale[:, None] + else: + v = v_0 + v = v.to(compute_type) + + acc += p[:, None] * v + + exp_sum = exp_sum * alpha + tl.sum(p, axis=0) + max_logit = max_logit_new + + acc = acc / exp_sum + + offs_out = seq_idx * stride_o_s + head_idx * stride_o_nh + head_sz_offs + out_mask = head_sz_offs < HEAD_SZ + tl.store( + out + offs_out, tl.sum(acc, axis=0).to(out.dtype.element_ty), mask=out_mask + ) + + +@triton.jit +def _paged_attn_decode_v1_w_dot_kernel_per_token_quant( + out_ptr, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + q_ptr, # [num_seqs, num_kv_heads * query_grp_sz, head_sz] + k_cache_ptr, # [num_blocks, num_kv_heads, kv_blk_sz, head_sz] + v_cache_ptr, # [num_blocks, 
num_kv_heads, kv_blk_sz, head_sz] + blk_tables_ptr, # [num_seqs, max_num_blks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes, # [num_kv_heads*query_grp_sz] + scale, + k_scale_ptr, # [num_blks, num_kv_heads, kv_blk_sz] + v_scale_ptr, # [num_blks, num_kv_heads, kv_blk_sz] + stride_o_s, + stride_o_nh, + stride_o_hs, + stride_q_s, + stride_q_nh, + stride_q_hs, + stride_k_b, + stride_k_nh, + stride_k_kb, + stride_k_hs, + stride_bt_s, + stride_bt_nb, + stride_k_scale_b, + stride_k_scale_nh, + stride_k_scale_kb, + compute_type: tl.constexpr, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + QUERY_GRP_SZ: tl.constexpr, + QUERY_GRP_SZ_POW2: tl.constexpr, + KV_BLK_SZ: tl.constexpr, + KV_BLK_SZ_POW2: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + seq_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + + log2e: tl.constexpr = 1.4426950408889634 + + seq_len = tl.load(seq_lens_ptr + seq_idx) + + num_kv_blks = tl.cdiv(seq_len, KV_BLK_SZ) + + blk_offs = tl.arange(0, KV_BLK_SZ_POW2) + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + q_grp_offs = tl.arange(0, QUERY_GRP_SZ_POW2) + + # load alibi slopes[QUERY_GRP_SZ_POW2] + if alibi_slopes is None: + alibi_slope = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + else: + alibi_slope = tl.load( + alibi_slopes + kv_head_idx * QUERY_GRP_SZ + q_grp_offs, + mask=q_grp_offs < QUERY_GRP_SZ, + other=0.0, + ) + + q_offs = ( + seq_idx * stride_q_s + + (kv_head_idx * QUERY_GRP_SZ + q_grp_offs[:, None]) * stride_q_nh + + head_sz_offs[None, :] * stride_q_hs + ) + + # load q[QUERY_GRP_SZ_POW2, HEAD_SZ_POW2] + q_mask = (q_grp_offs[:, None] < QUERY_GRP_SZ) & (head_sz_offs[None, :] < HEAD_SZ) + + q = tl.load(q_ptr + q_offs, mask=q_mask, other=0.0) + q = (q * scale).to(compute_type) + + acc = tl.zeros([QUERY_GRP_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + float("-inf") + exp_sum = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + + kv_offs = ( + kv_head_idx * stride_k_nh + 
+ blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] * stride_k_hs + ) + k_scale_offs = kv_head_idx * stride_k_scale_nh + blk_offs * stride_k_scale_kb + blk_tbl_start_ptr = blk_tables_ptr + seq_idx * stride_bt_s + + for b in range(num_kv_blks): + kv_blk_nums = tl.load(blk_tbl_start_ptr + b) + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = b * KV_BLK_SZ + blk_offs + kv_mask = ( + (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] < HEAD_SZ) + ) + + kv_scale_mask = (blk_seq_offs < seq_len) & (blk_offs < KV_BLK_SZ) + kv_scale_offs = kv_blk_nums * stride_k_scale_b + k_scale_offs + + # load k[KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_scale = tl.load(k_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + k_0 = tl.load(k_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + k = k_0.to(tl.float32) * k_scale[:, None] if k_0.dtype.is_fp8() else k_0 + k = k.to(compute_type) + + # qk: [QUERY_GRP_SZ_POW2, KV_BLK_SZ_POW2] + qk = tl.dot(q, k.T, out_dtype=tl.float32) + qk = tl.where( + (q_grp_offs[:, None] < QUERY_GRP_SZ) & (blk_seq_offs[None, :] < seq_len), + qk, + float("-inf"), + ) + + if alibi_slopes is not None: + qk += (alibi_slope[:, None] * (blk_seq_offs - seq_len + 1)[None, :]).to( + tl.float32 + ) + + qk = tl.where( + (q_grp_offs[:, None] < QUERY_GRP_SZ) & (blk_seq_offs[None, :] < seq_len), + qk, + float("-inf"), + ) + max_logit_new = tl.maximum(tl.max(qk, axis=1), max_logit) + + # p: [QUERY_GRP_SZ_POW2, KV_BLK_SZ_POW2] + p = tl.math.exp2((qk - max_logit_new[:, None]) * log2e) + alpha = tl.math.exp2((max_logit - max_logit_new) * log2e) + acc *= alpha[:, None] + + # v: [KV_BLK_SZ, HEAD_SZ] + v_scale = tl.load(v_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + v_0 = tl.load(v_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + v = v_0.to(tl.float32) * v_scale[:, None] if v_0.dtype.is_fp8() else v_0 + v = v.to(compute_type) + + p = p.to(v.dtype) + acc += tl.dot(p, v, out_dtype=tl.float32) + + 
exp_sum = exp_sum * alpha + tl.sum(p, axis=1) + max_logit = max_logit_new + + acc = acc / exp_sum[:, None] + + out_offs = ( + seq_idx * stride_o_s + + (kv_head_idx * QUERY_GRP_SZ + q_grp_offs[:, None]) * stride_o_nh + + head_sz_offs[None, :] + ) + + out_mask = (q_grp_offs[:, None] < QUERY_GRP_SZ) & (head_sz_offs[None, :] < HEAD_SZ) + tl.store(out_ptr + out_offs, acc.to(out_ptr.dtype.element_ty), mask=out_mask) + + +def paged_attn_decode_v2_per_token_quant( + output: torch.Tensor, # [num_seqs, num_kv_heads*query_grp_sz, head_sz], + query: torch.Tensor, # [num_seqs, num_kv_heads*query_grp_sz, head_sz], + key_cache: torch.Tensor, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] , + value_cache: torch.Tensor, # [num_blks, num_kv_heads, kv_blk_sz, head_sz] , + block_tables: torch.Tensor, # [num_seqs, max_num_blks_per_seq], + seq_lens: torch.Tensor, # [num_seqs], + max_seq_len: int, + compute_type, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: torch.Tensor, + v_scale: torch.Tensor, + max_num_partitions: int, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, +): + """ + #TODO: Add Doc + """ + + num_seqs = query.shape[0] + num_q_heads = query.shape[1] + kv_blk_sz = key_cache.shape[2] + head_sz = key_cache.shape[3] + query_grp_sz = num_q_heads // num_kv_heads + query_grp_sz_pow2 = triton.next_power_of_2(query_grp_sz) + + # Note: There is a bug in triton.next_power_of_2 function which causes it + # to update the passed in arg, so that's why we have a workaround here + # max_num_partitions_pow2 = triton.next_power_of_2(max_num_partitions) + if max_num_partitions == 0: + max_num_partitions_pow2 = 1 + else: + max_num_partitions_pow2 = 2 ** math.ceil(math.log2(max_num_partitions)) + + kv_blk_sz_pow2 = triton.next_power_of_2(kv_blk_sz) + head_sz_pow2 = triton.next_power_of_2(head_sz) + + # MHA + if query_grp_sz == 1: + 
grid = (num_q_heads, num_seqs, max_num_partitions) + shape_info = (num_seqs, num_q_heads, max_num_partitions) + exp_sums = torch.empty( + size=shape_info, dtype=torch.float32, device=output.device + ) + max_logits = torch.empty( + size=shape_info, dtype=torch.float32, device=output.device + ) + tmp_output = torch.empty( + (*shape_info, head_sz), dtype=output.dtype, device=output.device + ) + _paged_attn_decode_v2_wo_dot_kernel_per_token_quant[grid]( + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + block_tables, + seq_lens, + scale, + k_scale, + v_scale, + alibi_slopes, + exp_sums.stride(0), + exp_sums.stride(1), + tmp_output.stride(0), + tmp_output.stride(1), + tmp_output.stride(2), + query.stride(0), + query.stride(1), + key_cache.stride(0), + key_cache.stride(1), + key_cache.stride(2), + block_tables.stride(0), + block_tables.stride(1), + k_scale.stride(0), + k_scale.stride(1), + k_scale.stride(2), + compute_type=compute_type, + KV_BLK_SZ=kv_blk_sz, + KV_BLK_SZ_POW2=kv_blk_sz_pow2, + HEAD_SZ=head_sz, + HEAD_SZ_POW2=head_sz_pow2, + QUERY_GRP_SZ=query_grp_sz, + SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE, + MAX_NUM_BLKS_PER_SEQ=block_tables.shape[1], + MAX_SEQ_LEN_POW2=max_seq_len, + ) + grid = (num_q_heads, num_seqs, 1) + _paged_attn_decode_v2_wo_dot_reduce_kernel_per_token_quant[grid]( + output, + exp_sums, + max_logits, + tmp_output, + seq_lens, + output.stride(0), + output.stride(1), + exp_sums.stride(0), + exp_sums.stride(1), + tmp_output.stride(0), + tmp_output.stride(1), + tmp_output.stride(2), + HEAD_SZ=head_sz, + HEAD_SZ_POW2=head_sz_pow2, + SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE, + MAX_NUM_SEQ_PARTITIONS=int(max_num_partitions), + MAX_NUM_SEQ_PARTITIONS_POW2=int(max_num_partitions_pow2), + ) + # GQA + else: + grid = (num_seqs, num_kv_heads, max_num_partitions) + shape_info = (num_seqs, num_kv_heads, max_num_partitions, query_grp_sz) + max_logits = torch.empty(shape_info, dtype=torch.float32, device=output.device) + exp_sums = 
torch.empty(shape_info, dtype=torch.float32, device=output.device) + tmp_output = torch.empty( + *shape_info, head_sz, dtype=output.dtype, device=output.device + ) + if query_grp_sz <= 16: + query_grp_sz_pow2 = 16 + else: + query_grp_sz_pow2 = triton.next_power_of_2(query_grp_sz) + _paged_attn_decode_v2_w_dot_kernel_per_token_quant[grid]( + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + block_tables, + seq_lens, + scale, + k_scale, + v_scale, + alibi_slopes, + exp_sums.stride(0), + exp_sums.stride(1), + exp_sums.stride(2), + tmp_output.stride(0), + tmp_output.stride(1), + tmp_output.stride(2), + tmp_output.stride(3), + query.stride(0), + query.stride(1), + key_cache.stride(0), + key_cache.stride(1), + key_cache.stride(2), + block_tables.stride(0), + k_scale.stride(0), + k_scale.stride(1), + k_scale.stride(2), + compute_type=compute_type, + HEAD_SZ=head_sz, + HEAD_SZ_POW2=head_sz_pow2, + QUERY_GRP_SZ=query_grp_sz, + QUERY_GRP_SZ_POW2=query_grp_sz_pow2, + KV_BLK_SZ=kv_blk_sz, + KV_BLK_SZ_POW2=kv_blk_sz_pow2, + SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE, + ) + grid = (num_seqs, num_kv_heads, 1) + _paged_attn_decode_v2_w_dot_reduce_kernel_per_token_quant[grid]( + output, + exp_sums, + max_logits, + tmp_output, + seq_lens, + output.stride(0), + output.stride(1), + exp_sums.stride(0), + exp_sums.stride(1), + exp_sums.stride(2), + tmp_output.stride(0), + tmp_output.stride(1), + tmp_output.stride(2), + tmp_output.stride(3), + HEAD_SZ=head_sz, + HEAD_SZ_POW2=head_sz_pow2, + QUERY_GRP_SZ=query_grp_sz, + QUERY_GRP_SZ_POW2=query_grp_sz_pow2, + SEQ_PARTITION_SZ=_SEQ_PARTITION_SIZE, + MAX_NUM_SEQ_PARTITIONS=int(max_num_partitions), + MAX_NUM_SEQ_PARTITIONS_POW2=int(triton.next_power_of_2(max_num_partitions)), + ) + + +@triton.jit +def _paged_attn_decode_v2_wo_dot_kernel_per_token_quant( + exp_sums_ptr, + max_logits_ptr, + logits_ptr, + q_ptr, + k_cache_ptr, + v_cache_ptr, + blk_tables_ptr, + seq_lens_ptr, + scale, + k_scale_ptr, + v_scale_ptr, + 
alibi_slopes, + stride_exp_s, + stride_exp_h, + stride_logits_s, + stride_logits_h, + stride_logits_p, + stride_q_s, + stride_q_h, + stride_k_b, + stride_k_nh, + stride_k_kb, + stride_bt_s, + stride_bt_nb, + stride_k_scale_b, + stride_k_scale_nh, + stride_k_scale_kb, + compute_type: tl.constexpr, + KV_BLK_SZ: tl.constexpr, + KV_BLK_SZ_POW2: tl.constexpr, + HEAD_SZ: tl.constexpr, + HEAD_SZ_POW2: tl.constexpr, + QUERY_GRP_SZ: tl.constexpr, + SEQ_PARTITION_SZ: tl.constexpr, + MAX_NUM_BLKS_PER_SEQ: tl.constexpr, + MAX_SEQ_LEN_POW2: tl.constexpr, +): + """ + #TODO: Add Doc + """ + + head_idx = tl.program_id(0) + seq_idx = tl.program_id(1) + seq_part_idx = tl.program_id(2) + kv_head_idx = head_idx // QUERY_GRP_SZ + + log2e: tl.constexpr = 1.4426950408889634 + + seq_len = tl.load(seq_lens_ptr + seq_idx) + + if seq_part_idx * SEQ_PARTITION_SZ >= seq_len: + return + + seq_start_idx = seq_part_idx * SEQ_PARTITION_SZ + seq_end_idx = tl.minimum(seq_start_idx + SEQ_PARTITION_SZ, seq_len) + num_kv_blks = tl.cdiv(seq_end_idx - seq_start_idx, KV_BLK_SZ) + + blk_offs = tl.arange(0, KV_BLK_SZ_POW2) + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + + # load alibi slopes + if alibi_slopes is None: + alibi_slope = 0.0 + else: + alibi_slope = tl.load(alibi_slopes + head_idx) + + # load q[HEAD_SZ] + q_offs = seq_idx * stride_q_s + head_idx * stride_q_h + head_sz_offs + q = tl.load(q_ptr + q_offs, mask=head_sz_offs < HEAD_SZ) + q = (q * scale).to(compute_type) + + acc = tl.zeros([KV_BLK_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = float("-inf") + exp_sum = 0.0 + + kv_offs = ( + kv_head_idx * stride_k_nh + + blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] + ) + kv_blk_start = seq_part_idx * (SEQ_PARTITION_SZ // KV_BLK_SZ) + k_scale_offs = kv_head_idx * stride_k_scale_nh + blk_offs * stride_k_scale_kb + blk_tables_start_ptr = blk_tables_ptr + seq_idx * stride_bt_s + + for b in range(num_kv_blks): + kv_blk_idx = kv_blk_start + b + kv_blk_nums = tl.load(blk_tables_start_ptr + 
kv_blk_idx * stride_bt_nb) + + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = kv_blk_idx * KV_BLK_SZ + blk_offs + kv_mask = ( + (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] < HEAD_SZ) + ) + + kv_scale_mask = (blk_seq_offs < seq_len) & (blk_offs < KV_BLK_SZ) + kv_scale_offs = kv_blk_nums * stride_k_scale_b + k_scale_offs + + # load k[KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_scale = tl.load(k_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + k_0 = tl.load(k_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + k = k_0.to(tl.float32) * k_scale[:, None] if k_0.dtype.is_fp8() else k_0 + k = k.to(compute_type) + + # qk: [KV_BLK_SZ_POW2] + qk = tl.sum((q[None, :] * k).to(tl.float32), axis=1) + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + + if alibi_slopes is not None: + qk += (alibi_slope * (blk_seq_offs - seq_len + 1)).to(tl.float32) + qk = tl.where(blk_seq_offs < seq_len, qk, float("-inf")) + + max_logit_new = tl.maximum(max_logit, tl.max(qk, axis=0)) + + # p: [KV_BLK_SZ_POW2] + p = tl.math.exp2((qk - max_logit_new) * log2e) + alpha = tl.math.exp2((max_logit - max_logit_new) * log2e) + acc *= alpha[:, None] + + # v: [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + v_scale = tl.load(v_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + v_0 = tl.load(v_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + v = v_0.to(tl.float32) * v_scale[:, None] if v_0.dtype.is_fp8() else v_0 + v = v.to(compute_type) + + # acc: [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + acc += p[:, None] * v + + exp_sum = exp_sum * alpha + tl.sum(p, axis=0) + max_logit = max_logit_new + + acc = acc / exp_sum + + max_logits_offs = seq_idx * stride_exp_s + head_idx * stride_exp_h + seq_part_idx + + tl.store(max_logits_ptr + max_logits_offs, max_logit) + tl.store(exp_sums_ptr + max_logits_offs, exp_sum) + + logits_offs = ( + seq_idx * stride_logits_s + + head_idx * stride_logits_h + + seq_part_idx * stride_logits_p + + head_sz_offs + ) + 
@triton.jit
def _paged_attn_decode_v2_wo_dot_reduce_kernel_per_token_quant(
    out,
    exp_sums_ptr,
    max_logits_ptr,
    logits_ptr,
    seq_lens,
    stride_out_n,
    stride_out_h,
    stride_exp_sums_n,
    stride_exp_sums_h,
    stride_logits_n,
    stride_logits_h,
    stride_logits_b,
    HEAD_SZ: tl.constexpr,
    HEAD_SZ_POW2: tl.constexpr,
    SEQ_PARTITION_SZ: tl.constexpr,
    MAX_NUM_SEQ_PARTITIONS: tl.constexpr,
    MAX_NUM_SEQ_PARTITIONS_POW2: tl.constexpr,
):
    """
    Merge the per-partition partial attention results of one (head, sequence)
    pair into the final output row.

    Program ids: axis 0 is the query head, axis 1 is the sequence.

    For every partition ``p`` below ``cdiv(seq_len, SEQ_PARTITION_SZ)`` the
    kernel reads ``max_logits[p]``, ``exp_sums[p]`` and the partial output
    ``logits[p, :]`` and computes::

        w[p] = exp_sums[p] * exp(max_logits[p] - max(max_logits))
        out  = sum_p logits[p, :] * w[p] / sum_p w[p]

    Args:
        out: output pointer, addressed as
            ``seq * stride_out_n + head * stride_out_h + d``.
        exp_sums_ptr, max_logits_ptr: per-partition softmax statistics; both
            are addressed with ``stride_exp_sums_n`` / ``stride_exp_sums_h``
            and a unit stride over the partition index.
        logits_ptr: per-partition partial outputs, addressed with
            ``stride_logits_n`` / ``stride_logits_h`` / ``stride_logits_b``
            and a unit stride over the head dimension.
        seq_lens: pointer to one sequence length per sequence.
        HEAD_SZ / HEAD_SZ_POW2: real head size and the ``tl.arange`` extent
            used to cover it.
        SEQ_PARTITION_SZ: number of tokens per partition.
        MAX_NUM_SEQ_PARTITIONS: accepted but not read in this body.
        MAX_NUM_SEQ_PARTITIONS_POW2: ``tl.arange`` extent over partitions.
    """

    # get seq_idx, head_idx, seq_len
    head_idx = tl.program_id(axis=0)
    seq_idx = tl.program_id(axis=1)

    seq_len = tl.load(seq_lens + seq_idx)
    # Only partitions that actually contain tokens were written by the producer.
    num_partitions = tl.cdiv(seq_len, SEQ_PARTITION_SZ)

    head_sz_offs = tl.arange(0, HEAD_SZ_POW2)
    seq_part_offs = tl.arange(0, MAX_NUM_SEQ_PARTITIONS_POW2)

    # Placeholder; overwritten by the tl.max reduction below.
    max_logit = float("-inf")
    acc = tl.zeros([HEAD_SZ_POW2], dtype=tl.float32)
    global_exp_sum = tl.zeros([1], dtype=tl.float32)

    # load max_logits [MAX_NUM_SEQ_PARTITIONS_POW2]
    max_logits_offs = (
        seq_idx * stride_exp_sums_n + head_idx * stride_exp_sums_h + seq_part_offs
    )
    max_logits_mask = seq_part_offs < num_partitions
    # Unused partitions read as -inf so they cannot win the max.
    max_logits = tl.load(
        max_logits_ptr + max_logits_offs,
        mask=max_logits_mask,
        other=float("-inf"),
    )

    # find max_logit
    max_logit = tl.max(max_logits, axis=0)

    # load exp_sum [MAX_NUM_SEQ_PARTITIONS_POW2]
    exp_sums_offs = (
        seq_idx * stride_exp_sums_n + head_idx * stride_exp_sums_h + seq_part_offs
    )
    exp_sums_mask = seq_part_offs < num_partitions
    # Unused partitions read as 0 so they contribute no weight.
    exp_sums = tl.load(
        exp_sums_ptr + exp_sums_offs,
        mask=exp_sums_mask,
        other=0.0,
    )

    # rescaled_exp_sum and global_exp_sum
    # [MAX_NUM_SEQ_PARTITIONS_POW2]
    # NOTE(review): if seq_len can be 0, every lane is masked, max_logit is
    # -inf and (max_logits - max_logit) evaluates to NaN -- confirm callers
    # never pass empty sequences.
    rescaled_exp_sum = exp_sums * tl.exp(max_logits - max_logit)
    global_exp_sum += tl.sum(rescaled_exp_sum, axis=0)
    # After this division the vector holds the normalised weight of each partition.
    rescaled_exp_sum /= global_exp_sum

    # load logits
    logits_offs = (
        seq_idx * stride_logits_n
        + head_idx * stride_logits_h
        + seq_part_offs[:, None] * stride_logits_b
        + head_sz_offs[None, :]
    )
    logits_mask = (seq_part_offs[:, None] < num_partitions) & (
        head_sz_offs[None, :] < HEAD_SZ
    )

    logits = tl.load(logits_ptr + logits_offs, mask=logits_mask, other=0.0)
    # Weighted sum over the partition axis -> [HEAD_SZ_POW2].
    acc += tl.sum(logits * rescaled_exp_sum[:, None], axis=0)

    # store the final output
    # (despite its name, out_ptr is an element offset that is added to `out`)
    out_ptr = seq_idx * stride_out_n + head_idx * stride_out_h + head_sz_offs
    out_mask = head_sz_offs < HEAD_SZ
    tl.store(out + out_ptr, acc.to(out.dtype.element_ty), mask=out_mask)
seq_len = tl.load(seq_lens_ptr + seq_idx) + + if seq_part_idx * SEQ_PARTITION_SZ >= seq_len: + return + + seq_start_idx = seq_part_idx * SEQ_PARTITION_SZ + if seq_start_idx >= seq_len: + return + + seq_end_idx = tl.minimum(seq_start_idx + SEQ_PARTITION_SZ, seq_len) + + num_kv_blks = tl.cdiv(seq_end_idx - seq_start_idx, KV_BLK_SZ) + + blk_offs = tl.arange(0, KV_BLK_SZ_POW2) + head_sz_offs = tl.arange(0, HEAD_SZ_POW2) + q_grp_offs = tl.arange(0, QUERY_GRP_SZ_POW2) + + # load alibi slopes[QUERY_GRP_SZ_POW2] + if alibi_slopes is None: + alibi_slope = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + else: + alibi_slope = tl.load( + alibi_slopes + kv_head_idx * QUERY_GRP_SZ + q_grp_offs, + mask=q_grp_offs < QUERY_GRP_SZ, + other=0.0, + ) + + # load q[QUERY_GRP_SZ_POW2, HEAD_SZ_POW2] + q_offs = ( + seq_idx * stride_q_s + + (kv_head_idx * QUERY_GRP_SZ + q_grp_offs[:, None]) * stride_q_nh + + head_sz_offs[None, :] + ) + q_mask = (q_grp_offs[:, None] < QUERY_GRP_SZ) & (head_sz_offs[None, :] < HEAD_SZ) + q = tl.load(q_ptr + q_offs, mask=q_mask, other=0.0) + q = (q * scale).to(compute_type) + + acc = tl.zeros([QUERY_GRP_SZ_POW2, HEAD_SZ_POW2], dtype=tl.float32) + max_logit = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + float("-inf") + exp_sum = tl.zeros([QUERY_GRP_SZ_POW2], dtype=tl.float32) + + kv_offs = ( + kv_head_idx * stride_k_nh + + blk_offs[:, None] * stride_k_kb + + head_sz_offs[None, :] + ) + kv_blk_start = seq_part_idx * (SEQ_PARTITION_SZ // KV_BLK_SZ) + k_scale_offs = kv_head_idx * stride_k_scale_nh + blk_offs * stride_k_scale_kb + blk_tables_start_ptr = blk_tables_ptrs + seq_idx * stride_bt_s + for b in range(num_kv_blks): + kv_blk_idx = kv_blk_start + b + kv_blk_nums = tl.load(blk_tables_start_ptr + kv_blk_idx) + + kv_blk_offs = kv_blk_nums * stride_k_b + kv_offs + blk_seq_offs = kv_blk_idx * KV_BLK_SZ + blk_offs + kv_mask = ( + (blk_seq_offs[:, None] < seq_len) + & (blk_offs[:, None] < KV_BLK_SZ) + & (head_sz_offs[None, :] < HEAD_SZ) + ) + + kv_scale_mask = 
(blk_seq_offs < seq_len) & (blk_offs < KV_BLK_SZ) + kv_scale_offs = kv_blk_nums * stride_k_scale_b + k_scale_offs + + # load k[KV_BLK_SZ_POW2, HEAD_SZ_POW2] + k_scale = tl.load(k_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + k_0 = tl.load(k_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + k = k_0.to(tl.float32) * k_scale[:, None] if k_0.dtype.is_fp8() else k_0 + k = k.to(compute_type) + + # qk: [QUERY_GRP_SZ_POW2, KV_BLK_SZ_POW2] + qk = tl.dot(q, k.T, out_dtype=tl.float32) + qk = tl.where( + (q_grp_offs[:, None] < QUERY_GRP_SZ) & (blk_seq_offs[None, :] < seq_len), + qk, + float("-inf"), + ) + + if alibi_slopes is not None: + qk += (alibi_slope[:, None] * (blk_seq_offs - seq_len + 1)[None, :]).to( + tl.float32 + ) + qk = tl.where( + (q_grp_offs[:, None] < QUERY_GRP_SZ) & (blk_seq_offs[None, :] < seq_len), + qk, + float("-inf"), + ) + + max_logit_new = tl.maximum(max_logit, tl.max(qk, axis=1)) + + # p: [QUERY_GRP_SZ_POW2, KV_BLK_SZ_POW2] + p = tl.math.exp2((qk - max_logit_new[:, None]) * log2e) + alpha = tl.math.exp2((max_logit - max_logit_new) * log2e) + acc *= alpha[:, None] + + # v: [KV_BLK_SZ_POW2, HEAD_SZ_POW2] + v_scale = tl.load(v_scale_ptr + kv_scale_offs, mask=kv_scale_mask, other=0.0) + v_0 = tl.load(v_cache_ptr + kv_blk_offs, mask=kv_mask, other=0.0) + v = v_0.to(tl.float32) * v_scale[:, None] if v_0.dtype.is_fp8() else v_0 + v = v.to(compute_type) + + p = p.to(v.dtype) + acc += tl.dot(p, v, out_dtype=tl.float32) + + exp_sum = exp_sum * alpha + tl.sum(p, axis=1) + max_logit = max_logit_new + + acc = acc / exp_sum[:, None] + + max_logits_offs = ( + seq_idx * stride_max_logits_s + + kv_head_idx * stride_max_logits_nh + + seq_part_idx * stride_max_logits_p + + q_grp_offs + ) + m_grp_mask = q_grp_offs < QUERY_GRP_SZ + tl.store(max_logits_ptr + max_logits_offs, max_logit, mask=m_grp_mask) + tl.store(exp_sums_ptr + max_logits_offs, exp_sum, mask=m_grp_mask) + + logits_offs = seq_idx * stride_logits_s + logits_offs += kv_head_idx * 
@triton.jit
def _paged_attn_decode_v2_w_dot_reduce_kernel_per_token_quant(
    out_ptr,  # [num_seqs, num_kv_heads, q_grp_sz, head_sz]
    exp_sums_ptr,  # [num_seqs, num_kv_heads, max_parts, q_grp_sz]
    max_logits_ptr,  # [num_seqs, num_kv_heads, max_parts, q_grp_sz]
    logits_ptrs,  # [num_seqs, num_kv_heads, max_parts, q_grp_sz, head_sz]
    seq_lens_ptr,  # [num_seqs]
    stride_o_s,
    stride_o_h,
    stride_exp_sums_s,
    stride_exp_sums_h,
    stride_exp_sums_p,
    stride_logits_s,
    stride_logits_h,
    stride_logits_p,
    stride_logits_g,
    HEAD_SZ: tl.constexpr,
    HEAD_SZ_POW2: tl.constexpr,
    QUERY_GRP_SZ: tl.constexpr,
    QUERY_GRP_SZ_POW2: tl.constexpr,
    SEQ_PARTITION_SZ: tl.constexpr,
    MAX_NUM_SEQ_PARTITIONS: tl.constexpr,
    MAX_NUM_SEQ_PARTITIONS_POW2: tl.constexpr,
):
    """
    Merge the per-partition partial attention results of one (sequence,
    kv head) pair into the final output rows of its whole query group.

    Program ids: axis 0 is the sequence, axis 1 is the kv head.

    For every query ``g`` of the group and every partition ``p`` below
    ``cdiv(seq_len, SEQ_PARTITION_SZ)``::

        w[p, g]   = exp_sums[p, g] * exp(max_logits[p, g] - max_p max_logits[p, g])
        out[g, :] = sum_p logits[p, g, :] * w[p, g] / sum_p w[p, g]

    Output rows are written at query head ``kv_head * QUERY_GRP_SZ + g``.

    Args:
        out_ptr: output pointer, addressed with ``stride_o_s`` (sequence),
            ``stride_o_h`` (query head) and a unit stride over the head dim.
        exp_sums_ptr, max_logits_ptr: per-partition softmax statistics; both
            are addressed with the ``stride_exp_sums_*`` strides and a unit
            stride over the query-group index.
        logits_ptrs: per-partition partial outputs, addressed with the
            ``stride_logits_*`` strides and a unit stride over the head dim.
        seq_lens_ptr: pointer to one sequence length per sequence.
        HEAD_SZ / HEAD_SZ_POW2, QUERY_GRP_SZ / QUERY_GRP_SZ_POW2: real sizes
            and the ``tl.arange`` extents used to cover them.
        SEQ_PARTITION_SZ: number of tokens per partition.
        MAX_NUM_SEQ_PARTITIONS: accepted but not read in this body.
        MAX_NUM_SEQ_PARTITIONS_POW2: ``tl.arange`` extent over partitions.
    """

    seq_idx = tl.program_id(0)
    kv_head_idx = tl.program_id(1)

    seq_len = tl.load(seq_lens_ptr + seq_idx)
    # Only partitions that actually contain tokens are read.
    num_partitions = tl.cdiv(seq_len, SEQ_PARTITION_SZ)

    part_offs = tl.arange(0, MAX_NUM_SEQ_PARTITIONS_POW2)
    q_grp_offs = tl.arange(0, QUERY_GRP_SZ_POW2)
    head_offs = tl.arange(0, HEAD_SZ_POW2)

    # get global max logit
    # The same offsets/mask address both max_logits and exp_sums.
    exp_sums_offs = (
        seq_idx * stride_exp_sums_s
        + kv_head_idx * stride_exp_sums_h
        + part_offs[:, None] * stride_exp_sums_p
        + q_grp_offs[None, :]
    )
    exp_sums_mask = (part_offs[:, None] < num_partitions) & (
        q_grp_offs[None, :] < QUERY_GRP_SZ
    )

    # max_logits: [MAX_NUM_SEQ_PARTITIONS_POW2, QUERY_GRP_SZ_POW2]
    max_logits = tl.load(
        max_logits_ptr + exp_sums_offs, mask=exp_sums_mask, other=float("-inf")
    )
    # max_logit: [QUERY_GRP_SZ_POW2]
    ml = tl.max(max_logits, axis=0)

    # Rescale the exp sums and compute the global sum
    # exp_sums: [MAX_NUM_SEQ_PARTITIONS_POW2, QUERY_GRP_SZ_POW2]
    exp_sums = tl.load(exp_sums_ptr + exp_sums_offs, mask=exp_sums_mask, other=0.0)
    # NOTE(review): padded query-group lanes have ml == -inf, so this
    # subtraction yields NaN there; those lanes are masked out at the store.
    exp_sums *= tl.exp(max_logits - ml[None, :])

    # exp_sum: [QUERY_GRP_SZ_POW2]
    exp_sum = tl.sum(exp_sums, axis=0)

    # p: [MAX_NUM_SEQ_PARTITIONS_POW2, QUERY_GRP_SZ_POW2]
    p = exp_sums / exp_sum[None, :]
    # Add a trailing unit axis so p broadcasts over the head dimension.
    p = tl.reshape(p, (MAX_NUM_SEQ_PARTITIONS_POW2, QUERY_GRP_SZ_POW2, 1))

    # logits_offset
    logits_offset = (
        seq_idx * stride_logits_s
        + kv_head_idx * stride_logits_h
        + part_offs[:, None, None] * stride_logits_p
        + q_grp_offs[None, :, None] * stride_logits_g
        + head_offs[None, None, :]
    )
    # load logits
    # NOTE(review): this mask does not bound head_offs by HEAD_SZ; columns in
    # [HEAD_SZ, HEAD_SZ_POW2) are loaded and only discarded by the store mask.
    logits_mask = (part_offs[:, None] < num_partitions) & (
        q_grp_offs[None, :] < QUERY_GRP_SZ
    )
    logits = tl.load(
        logits_ptrs + logits_offset, mask=logits_mask[:, :, None], other=0.0
    )

    # out: [QUERY_GRP_SZ_POW2, HEAD_SZ_POW2]
    out = tl.sum((logits * p).to(tl.float32), axis=0).to(out_ptr.dtype.element_ty)

    # store output
    out_offs = (
        seq_idx * stride_o_s
        + (kv_head_idx * QUERY_GRP_SZ + q_grp_offs[:, None]) * stride_o_h
        + head_offs[None, :]
    )
    tl.store(
        out_ptr + out_offs,
        out,
        mask=(q_grp_offs[:, None] < QUERY_GRP_SZ) & (head_offs[None, :] < HEAD_SZ),
    )
@triton.jit
def _fwd_kernel(
    Q,
    K,
    V,
    K_cache,
    V_cache,
    B_Loc,
    sm_scale,
    k_scale,
    v_scale,
    B_Start_Loc,
    B_Seqlen,
    block_size: tl.constexpr,
    x: tl.constexpr,
    Out,
    stride_b_loc_b,
    stride_b_loc_s,
    stride_qbs: tl.constexpr,
    stride_qh: tl.constexpr,
    stride_qd: tl.constexpr,
    stride_kbs: tl.constexpr,
    stride_kh: tl.constexpr,
    stride_kd: tl.constexpr,
    stride_vbs: tl.constexpr,
    stride_vh: tl.constexpr,
    stride_vd: tl.constexpr,
    stride_obs: tl.constexpr,
    stride_oh: tl.constexpr,
    stride_od: tl.constexpr,
    stride_k_cache_bs: tl.constexpr,
    stride_k_cache_h: tl.constexpr,
    stride_k_cache_d: tl.constexpr,
    stride_k_cache_bl: tl.constexpr,
    stride_k_cache_x: tl.constexpr,
    stride_v_cache_bs: tl.constexpr,
    stride_v_cache_h: tl.constexpr,
    stride_v_cache_d: tl.constexpr,
    stride_v_cache_bl: tl.constexpr,
    num_queries_per_kv: int,
    IN_PRECISION: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,  # head size
    BLOCK_DMODEL_PADDED: tl.constexpr,  # head size padded to a power of 2
    BLOCK_N: tl.constexpr,
    SLIDING_WINDOW: tl.constexpr,
    SKIP_DECODE: tl.constexpr,
    USE_MATRIX_LOAD: tl.constexpr,  # bool
    HEAD_DIM_PAD_REQ: tl.constexpr,  # bool
    max_input_len,
):
    """Prefill attention for one BLOCK_M tile of query rows of one head.

    Program ids 0 / 1 / 2 select the batch entry, the query head and the
    BLOCK_M-row query tile. The kv head is ``cur_head // num_queries_per_kv``.

    The kernel runs an online softmax in two phases:

    1. Context phase: the query tile attends to the ``seq_len - query_len``
       tokens already stored in the paged ``K_cache`` / ``V_cache``, located
       through the block table ``B_Loc``. No causal mask is applied here.
       fp8 cache tiles are multiplied by the scalar loaded from ``k_scale``
       / ``v_scale`` and cast to the query dtype.
    2. Self phase: the query tile attends to the new tokens in ``K`` / ``V``
       under a causal mask.

    When ``SLIDING_WINDOW > 0`` both phases additionally mask keys that are
    ``SLIDING_WINDOW`` or more positions behind the query. The result
    ``acc / l_i`` is stored to ``Out`` for rows below the query length.

    Layout facts visible in the addressing below:
        * ``B_Start_Loc`` is read at ``cur_batch`` and ``cur_batch + 1``.
        * ``K_cache`` is addressed with five strides: block, head,
          ``d // x``, in-block token, ``d % x``.
        * ``V_cache`` is addressed with four strides: block, head, ``d``,
          in-block token.

    ``max_input_len`` is accepted but not read in this body.
    """
    if USE_MATRIX_LOAD:
        # if use matrix load, need make sure all cache tokens in BLOCK_N are in the same cache block
        tl.static_assert(BLOCK_N <= block_size and block_size % BLOCK_N == 0)
    cur_batch = tl.program_id(0)
    cur_head = tl.program_id(1)
    start_m = tl.program_id(2)

    cur_kv_head = cur_head // num_queries_per_kv

    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
    cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1)
    cur_batch_query_len = (cur_batch_in_all_stop_index -
                           cur_batch_in_all_start_index)
    # Tokens already in the cache = total length minus the new query tokens.
    cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len
    cur_batch_ctx_len = annotate_hint(cur_batch_ctx_len, "non-negative")

    # Single-token (decode) requests are left untouched when SKIP_DECODE is set.
    if SKIP_DECODE and cur_batch_query_len == 1:
        return

    # start position inside of the query
    # generally, N goes over kv, while M goes over query_len
    block_start_loc = BLOCK_M * start_m

    # initialize offsets
    # [N]; starts at 0
    offs_n = tl.arange(0, BLOCK_N)
    # [D]; starts at 0
    offs_d = tl.arange(0, BLOCK_DMODEL_PADDED)
    # [M]; starts at current position in query
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # [M,D]
    off_q = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
        cur_head * stride_qh + offs_d[None, :] * stride_qd)

    # dim_mask is only defined (and only used) when the head size needs padding.
    if HEAD_DIM_PAD_REQ:
        dim_mask = tl.where(
            tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1,
            0).to(tl.int1)  # [D]

        q = tl.load(Q + off_q,
                    mask=dim_mask[None, :] &
                    (offs_m[:, None] < cur_batch_query_len),
                    other=0.0)  # [M,D]
    else:
        q = tl.load(Q + off_q,
                    mask=(offs_m[:, None] < cur_batch_query_len),
                    other=0.0)  # [M,D]

    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")  # [M]
    # l_i starts at 1.0: the first real update multiplies it by
    # alpha = exp(-inf - m_j) = 0, and rows that never see a key end up
    # dividing a zero accumulator by 1 instead of by 0.
    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)  # [M]
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED],
                   dtype=tl.float32)  # [M,D]

    # compute query against context (no causal mask here)
    for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        if USE_MATRIX_LOAD:
            # all cache tokens in BLOCK_N are in the same cache block
            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
                         (start_n // block_size) * stride_b_loc_s)
            # [D,N]
            off_k = (bn * stride_k_cache_bs +
                     cur_kv_head * stride_k_cache_h +
                     (offs_d[:, None] // x) * stride_k_cache_d +
                     ((start_n + offs_n[None, :]) % block_size) *
                     stride_k_cache_bl +
                     (offs_d[:, None] % x) * stride_k_cache_x)
            # [N,D]
            # Base offset only: the in-block token / head-dim offsets are
            # supplied to tl.matrix_load further down.
            off_v = (
                bn * stride_v_cache_bs +
                cur_kv_head * stride_v_cache_h)
        else:
            # One block-table entry per key position; out-of-range positions
            # read entry 0.
            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
                         ((start_n + offs_n) // block_size) * stride_b_loc_s,
                         mask=(start_n + offs_n) < cur_batch_ctx_len,
                         other=0)  # [N]
            # we explicit tell compiler bn is non-negative
            bn = annotate_hint(bn, "non-negative")
            # set constancy to 16 for v cache load using load_dwordx4
            bn = tl.max_constancy(bn, [16])
            # [D,N]
            off_k = (bn[None, :] * stride_k_cache_bs +
                     cur_kv_head * stride_k_cache_h +
                     (offs_d[:, None] // x) * stride_k_cache_d +
                     ((start_n + offs_n[None, :]) % block_size) *
                     stride_k_cache_bl +
                     (offs_d[:, None] % x) * stride_k_cache_x)
            # [N,D]
            off_v = (
                bn[:, None] * stride_v_cache_bs +
                cur_kv_head * stride_v_cache_h +
                offs_d[None, :] * stride_v_cache_d +
                ((start_n + offs_n[:, None]) % block_size) * stride_v_cache_bl)

        if HEAD_DIM_PAD_REQ:
            if block_size % BLOCK_N == 0:
                # block_size % BLOCK_N == 0, seq will never cross the memory boundary
                k_load = tl.load(K_cache + off_k,
                                 mask=dim_mask[:, None],
                                 other=0.0)  # [D,N]
            else:
                k_load = tl.load(K_cache + off_k,
                                 mask=dim_mask[:, None] &
                                 ((start_n + offs_n[None, :]) < cur_batch_ctx_len),
                                 other=0.0)  # [D,N]
        else:
            if block_size % BLOCK_N == 0:
                k_load = tl.load(K_cache + off_k)  # [D,N]
            else:
                k_load = tl.load(K_cache + off_k,
                                 mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
                                 other=0.0)  # [D,N]

        # Dequantize an fp8 cache tile with the scalar k_scale.
        if k_load.dtype.is_fp8():
            k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype)
        else:
            k = k_load

        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)  # [M,N]
        qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
        # Keys past the context length (possibly loaded unmasked above) are
        # excluded here.
        qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
                      float("-inf"))
        qk *= sm_scale
        if SLIDING_WINDOW > 0:
            # Absolute query position is ctx_len + offs_m; absolute key
            # position is start_n + offs_n.
            qk = tl.where((cur_batch_ctx_len + offs_m[:, None]) -
                          (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk,
                          float("-inf"))

        # -- compute m_ij, p, l_ij
        m_j = tl.maximum(m_i, tl.max(qk, 1))
        if SLIDING_WINDOW > 0:
            # For sliding window there's a chance the max is -inf due to masking of
            # the entire row. In this case we need to set m_j 0 to avoid NaN
            m_j = tl.where(m_j > float("-inf"), m_j, 0.0)

        # P : (BLOCK_M, BLOCK_N)
        p = tl.exp(qk - m_j[:, None])
        # l_j : (BLOCK_M,)
        l_j = tl.sum(p, 1)
        # alpha : (BLOCK_M, )
        alpha = tl.exp(m_i - m_j)
        # scale acc
        acc = acc * alpha[:, None]
        # update acc
        if USE_MATRIX_LOAD:
            if HEAD_DIM_PAD_REQ:
                v_load = tl.matrix_load(
                    V_cache + off_v,
                    shape=[block_size, BLOCK_DMODEL],
                    strides=[stride_v_cache_bl, stride_v_cache_d],
                    block_shape=[BLOCK_N, BLOCK_DMODEL_PADDED],
                    offsets=[(start_n % block_size).to(tl.int32), 0],
                    boundary_check=(1,))  # [N,D]
            else:
                v_load = tl.matrix_load(
                    V_cache + off_v,
                    shape=[block_size, BLOCK_DMODEL],
                    strides=[stride_v_cache_bl, stride_v_cache_d],
                    block_shape=[BLOCK_N, BLOCK_DMODEL_PADDED],
                    offsets=[(start_n % block_size).to(tl.int32), 0])  # [N,D]
        else:
            seq_mask = (start_n + offs_n[:, None]) < cur_batch_ctx_len
            # set constancy of seq_mask to 8 for using vector load_dwordx4
            seq_mask = tl.max_constancy(seq_mask, [8, BLOCK_DMODEL_PADDED])
            if HEAD_DIM_PAD_REQ:
                if block_size % BLOCK_N == 0:
                    # block_size % BLOCK_N == 0, seq will never cross the memory boundary
                    v_load = tl.load(V_cache + off_v,
                                     mask=dim_mask[None, :],
                                     other=0.0)  # [N,D]
                else:
                    v_load = tl.load(V_cache + off_v,
                                     mask=dim_mask[None, :] & seq_mask,
                                     other=0.0)  # [N,D]
            else:
                if block_size % BLOCK_N == 0:
                    v_load = tl.load(V_cache + off_v)  # [N,D]
                else:

                    v_load = tl.load(V_cache + off_v,
                                     mask=seq_mask,
                                     other=0.0)  # [N,D]

        # Dequantize an fp8 cache tile with the scalar v_scale.
        if v_load.dtype.is_fp8():
            v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype)
        else:
            v = v_load
        p = p.to(v.dtype)

        acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION)
        # # update m_i and l_i
        l_i = l_i * alpha + l_j
        m_i = m_j

    # From here on off_k / off_v address the new tokens in K / V, not the cache.
    off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
             offs_d[:, None] * stride_kd)
    off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
             offs_d[None, :] * stride_vd)
    k_ptrs = K + off_k
    v_ptrs = V + off_v

    # block_mask is 0 when we're already past the current query length
    block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0)

    # compute query against itself (with causal mask)
    # The upper bound (start_m + 1) * BLOCK_M stops at the last key any row of
    # this tile may see; block_mask == 0 makes the range empty.
    for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        if HEAD_DIM_PAD_REQ:
            k = tl.load(k_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
                        mask=dim_mask[:, None] &
                        ((start_n + offs_n[None, :]) < cur_batch_query_len),
                        other=0.0)
        else:
            k = tl.load(k_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
                        mask=((start_n + offs_n[None, :]) < cur_batch_query_len),
                        other=0.0)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
        qk *= sm_scale
        # apply causal mask
        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
                      float("-inf"))
        if SLIDING_WINDOW > 0:
            qk = tl.where(
                offs_m[:, None] - (start_n + offs_n[None, :])
                < SLIDING_WINDOW, qk, float("-inf"))

        m_j = tl.maximum(m_i, tl.max(qk, 1))
        if SLIDING_WINDOW > 0:
            # For sliding window there's a chance the max is -inf due to masking of
            # the entire row. In this case we need to set m_j 0 to avoid NaN
            m_j = tl.where(m_j > float("-inf"), m_j, 0.0)

        # P : (BLOCK_M, BLOCK_N)
        p = tl.exp(qk - m_j[:, None])
        # l_j : (BLOCK_M,)
        l_j = tl.sum(p, 1)
        # alpha : (BLOCK_M, )
        alpha = tl.exp(m_i - m_j)
        # scale acc
        acc = acc * alpha[:, None]
        # update acc
        if HEAD_DIM_PAD_REQ:
            v = tl.load(v_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
                        mask=dim_mask[None, :] &
                        ((start_n + offs_n[:, None]) < cur_batch_query_len),
                        other=0.0)
        else:
            v = tl.load(v_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
                        mask=((start_n + offs_n[:, None]) < cur_batch_query_len),
                        other=0.0)
        p = p.to(v.dtype)

        acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION)
        # update m_i and l_i
        l_i = alpha * l_i + l_j
        m_i = m_j

    # Final softmax normalisation.
    acc = acc / l_i[:, None]
    # initialize pointers to output
    off_o = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
        cur_head * stride_oh + offs_d[None, :] * stride_od)
    out_ptrs = Out + off_o

    # Only rows inside the query (and, when padded, real head-dim columns)
    # are written.
    if HEAD_DIM_PAD_REQ:
        tl.store(out_ptrs,
                 acc,
                 mask=dim_mask[None, :] &
                 (offs_m[:, None] < cur_batch_query_len))
    else:
        tl.store(out_ptrs,
                 acc,
                 mask=(offs_m[:, None] < cur_batch_query_len))
    return
to a power of 2 + BLOCK_N: tl.constexpr, + SKIP_DECODE: tl.constexpr, + ): + # attn_bias[] + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // num_queries_per_kv + + # cur_batch_seq_len: the length of prompts + # cur_batch_ctx_len: the length of prefix + # cur_batch_in_all_start_index: the start id of the dim=0 + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1) + cur_batch_query_len = (cur_batch_in_all_stop_index - + cur_batch_in_all_start_index) + cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len + + if SKIP_DECODE and cur_batch_query_len == 1: + return + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL_PADDED) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + + dim_mask = tl.where( + tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(tl.int1) + + q = tl.load(Q + off_q, + mask=dim_mask[None, :] & + (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len), + other=0.0) + + # # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) + + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = tl.arange( + 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len + alibi_start_k = 0 + for start_n in range(0, cur_batch_ctx_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + + ((start_n + offs_n) // block_size) * stride_b_loc_s, + mask=(start_n + offs_n) < cur_batch_ctx_len, + other=0) + off_k = (bn[None, 
:] * stride_k_cache_bs + + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + offs_n[None, :]) % block_size) * + stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x) + off_v = ( + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) + k_load = tl.load(K_cache + off_k, + mask=dim_mask[:, None] & + ((start_n + offs_n[None, :]) < cur_batch_ctx_len), + other=0.0) # [D,N] + + if k_load.dtype.is_fp8(): + k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) + else: + k = k_load + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) + qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, + float("-inf")) + qk *= sm_scale + + # load alibi + alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - + alibi_start_q[:, None]) * alibi_slope + alibi = tl.where( + (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), + alibi, float("-inf")) + qk += alibi + alibi_start_k += BLOCK_N + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v_load = tl.load(V_cache + off_v, + mask=dim_mask[None, :] & + ((start_n + offs_n[:, None]) < cur_batch_ctx_len), + other=0.0) + if v_load.dtype.is_fp8(): + v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) + else: + v = v_load + p = p.to(v.dtype) + + acc = tl.dot(p, v, acc=acc, input_precision='ieee') + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + + 
offs_d[:, None] * stride_kd) + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + + offs_d[None, :] * stride_vd) + k_ptrs = K + off_k + v_ptrs = V + off_v + + block_mask = tl.where( + block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) + + # init alibi + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = tl.arange( + 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len + alibi_start_k = cur_batch_ctx_len + # # init debugger + # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc + # offset_db_k = tl.arange(0, BLOCK_N) + # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL] + for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=dim_mask[:, None] & + ((start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len), + other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, acc=qk, input_precision='ieee') + qk *= sm_scale + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, + float("-inf")) + + # load alibi + alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - + alibi_start_q[:, None]) * alibi_slope + alibi = tl.where( + (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), + alibi, float("-inf")) + qk += alibi + alibi_start_k += BLOCK_N + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(v_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=dim_mask[None, :] & + ((start_n + offs_n[:, None]) + < 
@functools.lru_cache
def find_block(a, b):
    """Return the largest power of two that is >= 16, <= b and divides a.

    Returns None when either argument is below 16 or when no such power
    of two divides ``a``. Results are memoized per ``(a, b)`` pair.
    """
    if b < 16 or a < 16:
        return None
    # Largest power of two that does not exceed b.
    candidate = 1 << (b.bit_length() - 1)
    # Walk the powers of two downwards, stopping after 16.
    while candidate >= 16:
        if a % candidate == 0:
            return candidate
        candidate >>= 1
    return None
head_size, + slide_window, use_alibi_slopes, + skip_decode, kv_dtype) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + configs = {int(key): val for key, val in json.load(f)["config"].items()} + if configs: + config = configs[min(configs.keys(), key=lambda x: abs(x - max_input_len))] + # logger.info(f"context_attention_fwd use kernel config from:{config_file_path}") + return config + + # If no optimized configuration is available, we will use the default + logger.warning( + f"\nUsing default context_attention_fwd kernel config. Performance might " + f"be sub-optimal! Config not found at {config_file_path}") + return None + + @torch.inference_mode() + def context_attention_fwd(q, + k, + v, + o, + kv_cache_dtype: str, + k_cache, + v_cache, + b_loc, + b_start_loc, + b_seq_len, + max_input_len, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + alibi_slopes=None, + sliding_window=None, + sm_scale=None, + skip_decode=False): + + q_dtype_is_f32 = q.dtype is torch.float32 + # shape constraints + Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + assert Lq == Lk and Lk == Lv + # round up Lk to a power of 2 - this is required for Triton block size + Lk_padded = triton.next_power_of_2(Lk) + + # 0 means "disable" + if sliding_window is None or sliding_window <= 0: + sliding_window = 0 + use_alibi_slopes = False if alibi_slopes is None else True + # open when kernel tuned + # config = get_context_attention_fwd_config(v_cache.shape[3], Lk, max_input_len, + # sliding_window, use_alibi_slopes, + # skip_decode, k_cache.dtype) + config = None + if not config: + config = ({'num_warps': NUM_WARPS, 'num_stages': 1} + if use_alibi_slopes else + {'num_warps': NUM_WARPS, 'num_stages': 1, 'USE_MATRIX_LOAD': False}) + + if 'BLOCK_N' not in config or 'BLOCK_M' not in config: + BLOCK = BASE_BLOCK // 2 if q_dtype_is_f32 else BASE_BLOCK + config['BLOCK_M'] = BLOCK + + BLOCK = 64 + cache_ele_size = v_cache.element_size() + if BLOCK * Lk_padded * cache_ele_size > 
16384: # 64 * 128 * 2 + BLOCK = 1 << ((16384 // (Lk_padded * cache_ele_size)).bit_length() - 1) + config['BLOCK_N'] = BLOCK + # print(f"context_attention_fwd: {config=}") + # Turing does have tensor core for float32 multiplication + # use ieee as fallback for triton kernels work. There is also + # warning on vllm/config.py to inform users this fallback + # implementation + IN_PRECISION = None + if sm_scale is None: + sm_scale = 1.0 / (Lq**0.5) + batch, head = b_seq_len.shape[0], q.shape[1] + num_queries_per_kv = q.shape[1] // k.shape[1] + + assert batch + 1 == len(b_start_loc) + grid = (batch, head, triton.cdiv(max_input_len, config['BLOCK_M'])) # batch, head, + + if "fp8" in kv_cache_dtype and (k_cache.dtype == torch.uint8 or v_cache.dtype == torch.uint8): + # kv_cache may view as uint8 + if kv_cache_dtype in ("fp8", "fp8e4m3"): + target_dtype = torch.float8_e4m3fn + elif kv_cache_dtype == "fp8e5m2": + target_dtype = torch.float8_e5m2 + else: + raise ValueError("Unsupported FP8 dtype:", kv_cache_dtype) + k_cache = k_cache.view(target_dtype) + v_cache = v_cache.view(target_dtype) + + if alibi_slopes is not None: + _fwd_kernel_alibi[grid]( + q, + k, + v, + k_cache, + v_cache, + b_loc, + sm_scale, + k_scale, + v_scale, + b_start_loc, + b_seq_len, + alibi_slopes, + v_cache.shape[3], + k_cache.shape[4], + o, + b_loc.stride(0), + b_loc.stride(1), + q.stride(0), + q.stride(1), + q.stride(2), + k.stride(0), + k.stride(1), + k.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + k_cache.stride(0), + k_cache.stride(1), + k_cache.stride(2), + k_cache.stride(3), + k_cache.stride( + 4 + ), #[num_blocks, num_kv_heads, head_size/x, block_size, x] + v_cache.stride(0), + v_cache.stride(1), + v_cache.stride(2), + v_cache.stride( + 3), #[num_blocks, num_kv_heads, head_size, block_size] + num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, + BLOCK_DMODEL=Lk, + BLOCK_DMODEL_PADDED=Lk_padded, + SKIP_DECODE=skip_decode, + 
**config, + ) + return + + _fwd_kernel[grid]( + q, + k, + v, + k_cache, + v_cache, + b_loc, + sm_scale, + k_scale, + v_scale, + b_start_loc, + b_seq_len, + v_cache.shape[3], + k_cache.shape[4], + o, + b_loc.stride(0), + b_loc.stride(1), + q.stride(0), + q.stride(1), + q.stride(2), + k.stride(0), + k.stride(1), + k.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + k_cache.stride(0), + k_cache.stride(1), + k_cache.stride(2), + k_cache.stride(3), + k_cache.stride( + 4), #[num_blocks, num_kv_heads, head_size/x, block_size, x] + v_cache.stride(0), + v_cache.stride(1), + v_cache.stride(2), + v_cache.stride( + 3), #[num_blocks, num_kv_heads, head_size, block_size] + num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, + BLOCK_DMODEL=Lk, + BLOCK_DMODEL_PADDED=Lk_padded, + SLIDING_WINDOW=sliding_window, + SKIP_DECODE=skip_decode, + HEAD_DIM_PAD_REQ=(Lk != Lk_padded), + max_input_len=max_input_len, + **config, + ) + return diff --git a/aiter/ops/triton/pod_attention.py b/aiter/ops/triton/pod_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..4ff85174a3e4faeef3e92086ffcf205ce975a602 --- /dev/null +++ b/aiter/ops/triton/pod_attention.py @@ -0,0 +1,560 @@ +import torch +import triton +import triton.language as tl + +import importlib.util +from pathlib import Path + +file_path = Path("./aiter/ops/triton/lean_atten.py").resolve() +module_name = "la_persistent" +spec = importlib.util.spec_from_file_location(module_name, file_path) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) + + +def pod_attention( + cu_ctr: torch.Tensor, + # Decode + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + Mp: torch.Tensor, + Lp: torch.Tensor, + Op: torch.Tensor, + locks: torch.Tensor, + batch_num_block_n: torch.Tensor, + total_programs: int, + BLOCK_M: int, + BLOCK_N: int, + # causal: bool, + batch_size: int, + sm_scale: torch.float16, + num_warps, + 
waves_per_eu, + # Prefill + q_pf: torch.Tensor, + k_pf: torch.Tensor, + v_pf: torch.Tensor, + Mp_pf: torch.Tensor, + Lp_pf: torch.Tensor, + Op_pf: torch.Tensor, + locks_pf: torch.Tensor, + batch_num_block_n_pf: torch.Tensor, + BLOCK_M_pf: int, + BLOCK_N_pf: int, + # causal_pf: bool, + batch_size_pf: int, + prefill_ratio: int, + decode_ratio: int, +): + # shape constraints + HEAD_DIM_Q, HEAD_DIM_K, HEAD_DIM_V = q.shape[-1], k.shape[-1], v.shape[-1] + assert ( + HEAD_DIM_Q == HEAD_DIM_K and HEAD_DIM_K == HEAD_DIM_V + ), "Incompatible Q/K/V Hidden Dimensions" + assert HEAD_DIM_K in {16, 32, 64, 128, 256} + + # Calculate Decode Params + N_CTX_Q = q.shape[0] // batch_size + N_CTX_K = k.shape[0] # This is the sum of all ctx_n in a batch + H = q.shape[1] + + qk_scale = sm_scale * 1.44269504 + + # We assume the kernel functions fused by pod attention are persistent kernel functions + total_wgs = total_programs // 2 + + ( + num_m_blocks, + num_n_blocks, + high_load_wgs, + max_tiles_per_wg, + tiles_per_head, + num_splits, + even_split, + ) = get_num_splits_and_buffer_sizes( + False, # causal + batch_size, + N_CTX_Q, + N_CTX_K, + H, + H, + BLOCK_M, + BLOCK_N, + total_wgs, + ) + # print(" Decode LA params") + # print(f" num_m_blocks={num_m_blocks}, high_load_wgs={high_load_wgs}, max_tiles_per_wg={max_tiles_per_wg}") + # print(f" tiles_per_head={tiles_per_head}, total_wgs={total_wgs}") + + o = torch.empty_like(q, dtype=v.dtype) + + # Calculate Prefill Params + N_CTX_Q_pf = q_pf.shape[0] // batch_size + N_CTX_K_pf = k_pf.shape[0] # This is the sum of all ctx_n in a batch + + # MASKED_BLOCKS is used for prefill/causal for BLOCK_M > BLOCK_N + MASKED_BLOCKS = BLOCK_M_pf // BLOCK_N_pf + + # if causal_pf: + # Only support BLOCK_M is multiple of BLOCK_N + # TODO: add other scenarios + assert BLOCK_M_pf % BLOCK_N_pf == 0 + + # num_m_blocks_pf, high_load_wgs_pf, max_tiles_per_wg_pf, tiles_per_head_pf, num_splits_pf, even_split_pf = ( + # get_num_splits_and_buffer_sizes(causal_pf, 
N_CTX_Q_pf, N_CTX_K_pf, H, H, HEAD_DIM_Q, BLOCK_M_pf, BLOCK_N_pf, total_programs) + # ) + ( + num_m_blocks_pf, + num_n_blocks_pf, + high_load_wgs_pf, + max_tiles_per_wg_pf, + tiles_per_head_pf, + num_splits_pf, + even_split_pf, + ) = get_num_splits_and_buffer_sizes( + True, # causal, + batch_size_pf, + N_CTX_Q_pf, + N_CTX_K_pf, + H, + H, + BLOCK_M_pf, + BLOCK_N_pf, + total_wgs, + ) + print("\n Prefill LA params") + print( + f" num_m_blocks={num_m_blocks_pf}, high_load_wgs={high_load_wgs_pf}, max_tiles_per_wg={max_tiles_per_wg_pf}" + ) + print(f" tiles_per_head={tiles_per_head_pf}, total_wgs={total_wgs}") + print( + f" BLOCK_M_pf={BLOCK_M_pf}, BLOCK_N_pf={BLOCK_N_pf}, MASKED_BLOCKS={MASKED_BLOCKS}" + ) + print( + f" batch_size_pf={batch_size_pf}, num_m_blocks_pf={num_m_blocks_pf}, num_n_blocks_pf={num_n_blocks_pf}" + ) + + print(f" Launching {total_programs} of kernels") + + grid = (total_programs, 1, 1) + + o_pf = torch.empty_like(q_pf, dtype=v_pf.dtype) + + pod_kernel = pod_persistent[grid]( + cu_ctr, + # Decode positional arguments + q, + k, + v, + qk_scale, + Mp, + Lp, + Op, + o, + batch_num_block_n, + locks, + q.stride(0), # N_CTX_Q + q.stride(1), # H + q.stride(2), # HEAD_DIM + k.stride(0), + k.stride(1), + k.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + Op.stride(0), # total_programs + Op.stride(1), # N_CTX_Q + Op.stride(2), # HEAD_DIM + # Prefill positional arguments + q_pf, + k_pf, + v_pf, + Mp_pf, + Lp_pf, + Op_pf, + o_pf, + batch_num_block_n_pf, + locks_pf, + q_pf.stride(0), + q_pf.stride(1), + q_pf.stride(2), + k_pf.stride(0), + k_pf.stride(1), + k_pf.stride(2), + v_pf.stride(0), + v_pf.stride(1), + v_pf.stride(2), + o_pf.stride(0), + o_pf.stride(1), + o_pf.stride(2), + Op_pf.stride(0), + Op_pf.stride(1), + Op_pf.stride(2), + # Decode keyword argument + HEAD_DIM=HEAD_DIM_K, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + batch_size=batch_size, + num_m_blocks=num_m_blocks, + num_n_blocks=num_n_blocks, + # 
def get_num_splits_and_buffer_sizes(
    causal,
    batch_size,
    max_seqlen_q,
    max_seqlen_k,
    num_heads,
    num_heads_k,
    BLOCK_M,
    BLOCK_N,
    num_SMs,
):
    """Compute the Lean Attention tile decomposition for a persistent launch.

    Based on onnxruntime/contrib_ops/cuda/bert/lean_attention.

    Parameters:
    - causal: whether the causal (prefill) tile count is used. Forced to
      False when the effective query length is 1.
    - batch_size: divisor applied to the returned ``num_n_blocks``.
    - max_seqlen_q / max_seqlen_k: query / key lengths used for tiling.
    - num_heads / num_heads_k: query and key head counts.
    - BLOCK_M / BLOCK_N: tile sizes along the query / key dimensions.
    - num_SMs: number of workgroups (CTAs) in the launch grid.

    Returns a tuple:
    ``(num_m_blocks, num_n_blocks, high_load_tbs, max_tiles_per_tb,
    tiles_per_head, num_splits, even_split)``.
    """
    num_m_blocks = (max_seqlen_q + BLOCK_M - 1) // BLOCK_M
    num_n_blocks = (max_seqlen_k + BLOCK_N - 1) // BLOCK_N

    # TODO: Support Grouped-Query Attention
    max_seqlen_q = max_seqlen_q * num_heads // num_heads_k

    if max_seqlen_q == 1:
        causal = False

    if causal:
        # Prefill - Causal: query block i only covers keys up to (i + 1) * BLOCK_M.
        tiles_per_head = sum(
            ((i + 1) * BLOCK_M + BLOCK_N - 1) // BLOCK_N for i in range(num_m_blocks)
        )
    else:
        # Decode or Not Causal
        tiles_per_head = num_m_blocks * num_n_blocks

    total_tiles = tiles_per_head * num_heads_k  # Total tiles across all heads

    # StreamK Lean has as many threadblocks as SMs
    lean_griddimz = num_SMs  # CTA launch grid

    # Max number lean tiles per task block (CTA)
    max_tiles_per_tb = (total_tiles + lean_griddimz - 1) // lean_griddimz

    # Find max number of splits
    even_split = total_tiles % lean_griddimz == 0
    if even_split or max_tiles_per_tb == 1:
        # Every active workgroup carries exactly max_tiles_per_tb tiles.
        # This also covers the case of fewer tiles than workgroups
        # (max_tiles_per_tb == 1 with an uneven split), where the low-load
        # workgroups are idle and the uneven formula below would divide by zero.
        num_splits = 1 + ((num_n_blocks + max_tiles_per_tb - 2) // max_tiles_per_tb)
    else:
        # Low-load workgroups carry one tile fewer than high-load ones.
        num_splits = 1 + (
            (num_n_blocks + max_tiles_per_tb - 3) // (max_tiles_per_tb - 1)
        )

    # high_load_tbs is the remainder of total_tile / num_cta
    high_load_tbs = total_tiles - ((max_tiles_per_tb - 1) * lean_griddimz)

    # Needed for causal. This is (per batch n_ctx) // BLOCK_N
    num_n_blocks = num_n_blocks // batch_size

    return (
        num_m_blocks,
        num_n_blocks,
        high_load_tbs,
        max_tiles_per_tb,
        tiles_per_head,
        num_splits,
        even_split,
    )
num_n_blocks: tl.constexpr, + # leanAttention params + high_load_wgs: tl.constexpr, + max_tiles_per_wg: tl.constexpr, + tiles_per_head: tl.constexpr, + num_splits: tl.constexpr, + # Prefill + # HEAD_DIM: tl.constexpr, + BLOCK_M_pf: tl.constexpr, + BLOCK_N_pf: tl.constexpr, + MASKED_BLOCKS: tl.constexpr, + batch_size_pf: tl.constexpr, + # causal: tl.constexpr, + num_m_blocks_pf: tl.constexpr, + num_n_blocks_pf: tl.constexpr, + # leanAttention params + high_load_wgs_pf: tl.constexpr, + max_tiles_per_wg_pf: tl.constexpr, + tiles_per_head_pf: tl.constexpr, + num_splits_pf: tl.constexpr, + # Prefill/Decode common + prefill_ratio: tl.constexpr, + decode_ratio: tl.constexpr, +): + + # cu_id: 4 bits, se_id: 2 bits, xcc_id: 4 bits + (cu_id, se_id, xcc_id) = get_cu_id() + gcu_id = (xcc_id << 6) + (se_id << 4) + cu_id + # tl.device_print("gcu_id is ", gcu_id) + + # cu_ctr is initialized to zero + # tl.atomic_add(cu_ctr + gcu_id, 1) + ratio = prefill_ratio + decode_ratio + op = 0 # 0 - decode + ticket = (tl.atomic_add(cu_ctr + gcu_id, 1)) % ratio + # ticket=tl.atomic_add(cu_ctr,1) + # if ticket >= 304: + # op=1 + if ticket < prefill_ratio: + op = 1 # 1 - prefill + + current_pid = tl.program_id(0) % 304 + # if gcu_id==352: + # tl.device_print("ticket is", ticket) + # tl.device_print("op is ", op) + # tl.device_print("op is:", op) + if op == 0: # 0 - decode + # decode_time = read_realtime() + # if gcu_id==0: + # tl.device_print("time to start decode kernel", decode_time) + module.la_persistent( + True, + current_pid, + Q, + K, + V, + qk_scale, + Mp, + Lp, + Op, + Out, + batch_num_block_n, + locks, + stride_qm, + stride_qh, + stride_qk, + stride_kn, + stride_kh, + stride_kk, + stride_vn, + stride_vh, + stride_vk, + stride_om, + stride_oh, + stride_on, + stride_oph, + stride_opm, + stride_opn, + HEAD_DIM, #: tl.constexpr, + BLOCK_M, #: tl.constexpr, + BLOCK_N, #: tl.constexpr, + MASKED_BLOCKS, + batch_size, #: tl.constexpr, + False, # tl.constexpr, + num_m_blocks, #: tl.constexpr, 
+ num_n_blocks, + # leanAttention params + high_load_wgs, #: tl.constexpr, + max_tiles_per_wg, #: tl.constexpr, + tiles_per_head, #: tl.constexpr, + num_splits, #: tl.constexpr, + ) + tl.debug_barrier() + # decode_time = read_realtime() - decode_time + # if gcu_id==0: + # tl.device_print("time to run decode", decode_time) + + # tl.device_print("gcu_id for decode", gcu_id) + else: + # prefill_time = read_realtime() + # if gcu_id==0: + # tl.device_print("time to start prefill kernel", prefill_time) + # tl.device_print("gcu_id start prefill kernel", gcu_id) + module.la_persistent( + True, + current_pid, + Q_pf, + K_pf, + V_pf, + qk_scale, + Mp_pf, + Lp_pf, + Op_pf, + Out_pf, + batch_num_block_n_pf, + locks_pf, + stride_qm_pf, + stride_qh_pf, + stride_qk_pf, + stride_kn_pf, + stride_kh_pf, + stride_kk_pf, + stride_vn_pf, + stride_vh_pf, + stride_vk_pf, + stride_om_pf, + stride_oh_pf, + stride_on_pf, + stride_oph_pf, + stride_opm_pf, + stride_opn_pf, + HEAD_DIM, #: tl.constexpr, + BLOCK_M_pf, #: tl.constexpr, + BLOCK_N_pf, #: tl.constexpr, + MASKED_BLOCKS, + batch_size_pf, #: tl.constexpr, + True, # causaltl.constexpr, + num_m_blocks_pf, #: tl.constexpr, + num_n_blocks_pf, + # leanAttention params + high_load_wgs_pf, #: tl.constexpr, + max_tiles_per_wg_pf, #: tl.constexpr, + tiles_per_head_pf, #: tl.constexpr, + num_splits_pf, #: tl.constexpr, + ) + tl.debug_barrier() + # prefill_time = read_realtime() - prefill_time + # if gcu_id==0: + # tl.device_print("time to run prefill kernel", prefill_time) + # tl.device_print("gcu_id for prefill", gcu_id) diff --git a/aiter/ops/triton/prefill_attention.py b/aiter/ops/triton/prefill_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..9d7c335deb92928adee04fc5015fa120983bd8ba --- /dev/null +++ b/aiter/ops/triton/prefill_attention.py @@ -0,0 +1,220 @@ +# SPDX-License-Identifier: MIT + +# Copyright (C) 2023-2025 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not 
use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +""" +Memory-efficient attention for prefill. +It supporst page size = 1. +""" + +# Adapted from +# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1 +import triton +import triton.language as tl + + +def is_hip(): + return triton.runtime.driver.active.get_current_target().backend == "hip" + + +_is_cuda = False + +_is_hip = is_hip() + + +@triton.jit +def _fwd_kernel( + Q, + K, + V, + sm_scale, + B_Start_Loc, + B_Seqlen, + Out, + stride_qbs, + stride_qh, + stride_kbs, + stride_kh, + stride_vbs, + stride_vh, + stride_obs, + stride_oh, + kv_group_num: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + IS_CAUSAL: tl.constexpr, + Lk: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // kv_group_num + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + + offs_d[None, :] + ) + off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, 
None] + off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] + + mask_d = offs_d < Lk + + q = tl.load( + Q + off_q, + mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]), + other=0.0, + ) + + k_ptrs = K + off_k + v_ptrs = V + off_v + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) + + end_n = ( + cur_batch_seq_len + if not IS_CAUSAL + else tl.minimum((start_m + 1) * BLOCK_M, cur_batch_seq_len) + ) + for start_n in range(0, block_mask * end_n, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load( + k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]), + other=0.0, + ) + # mask = tl.load(mask_ptrs + start_n, mask=start_n + offs_n < cur_batch_end_loc, other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk *= sm_scale + + if IS_CAUSAL: + qk += tl.where( + (start_n + offs_n[None, :] < cur_batch_seq_len) + & (offs_m[:, None] >= (start_n + offs_n[None, :])), + 0, + float("-inf"), + ) + else: + qk += tl.where( + (start_n + offs_n[None, :]) < cur_batch_seq_len, 0, float("-inf") + ) + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load( + v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=((start_n + offs_n[:, 
def context_attention_fwd(
    q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
):
    """
    Launch the prefill attention kernel; the result is written into ``o``.

    q, k, v: [b * s, head, head_dim]
    b_start_loc: [b]
    b_seq_len: [b]
    o: [b * s, head, head_dim]
    max_input_len: used to size the query-tile dimension of the grid
    is_causal: forwarded to the kernel as IS_CAUSAL
    """
    # The same tile edge is used for BLOCK_M and BLOCK_N.
    tile = 128 if _is_hip else 64

    q_head_dim = q.shape[-1]
    kv_head_dim = k.shape[-1]
    num_q_heads = q.shape[1]
    num_seqs = b_seq_len.shape[0]

    # One program per (sequence, query head, query tile).
    launch_grid = (num_seqs, num_q_heads, triton.cdiv(max_input_len, tile))

    _fwd_kernel[launch_grid](
        q,
        k,
        v,
        1.0 / (q_head_dim**0.5),
        b_start_loc,
        b_seq_len,
        o,
        q.stride(0),
        q.stride(1),
        k.stride(0),
        k.stride(1),
        v.stride(0),
        v.stride(1),
        o.stride(0),
        o.stride(1),
        kv_group_num=num_q_heads // k.shape[1],
        BLOCK_M=tile,
        BLOCK_DMODEL=triton.next_power_of_2(kv_head_dim),
        BLOCK_N=tile,
        IS_CAUSAL=is_causal,
        num_warps=8 if kv_head_dim > 64 else 4,
        num_stages=1,
        Lk=kv_head_dim,
    )
@triton.jit
def _dynamic_per_tensor_quant_fp8_i8_kernel(
    x_in_ptr,
    scale_out_ptr,
    cols: int,
    x_in_stride_r: int,
    NUM_COL_POW2: tl.constexpr,
    DTYPE_MAX: tl.constexpr,
):
    """
    Reduce one row of x_in to max(|x|) / DTYPE_MAX and fold it into the
    single value at scale_out_ptr with an atomic max.

    One program handles one row (program id along axis 0 is the row index).
    NOTE(review): the atomic max only yields the tensor-wide scale if the
    caller initialises scale_out to a value <= the true scale (e.g. zero) -
    confirm against callers.
    """
    pid = tl.program_id(axis=0)
    # Program ids start at 0, so the valid compiler hint is ">= 0".
    tl.assume(pid >= 0)
    tl.assume(x_in_stride_r > 0)

    offs = pid * x_in_stride_r + tl.arange(0, NUM_COL_POW2)
    mask = tl.arange(0, NUM_COL_POW2) < cols
    # Padding lanes (cols..NUM_COL_POW2) must read as 0 so they cannot win the
    # abs-max reduction; without `other` their contents are undefined.
    x = tl.load(x_in_ptr + offs, mask=mask, other=0.0, cache_modifier=".cg")

    m = tl.max(tl.abs(x))
    tl.atomic_max(scale_out_ptr, m / DTYPE_MAX, sem="relaxed")
@triton.jit
def _dynamic_per_token_quant_fp8_i8_kernel(
    qx_ptr,
    scale_out_ptr,
    x_in_ptr,
    cols: int,
    x_in_stride_r: int,
    NUM_COL_POW2: tl.constexpr,
    DTYPE_MAX: tl.constexpr,
):
    """
    Quantize one row of x_in with its own scale.

    One program handles one row: it computes scale = max(|row|) / DTYPE_MAX,
    stores the scale at scale_out_ptr[row] and stores row / scale, cast to
    the element type of qx_ptr, at the same offsets it was read from.

    NOTE(review): the output is addressed with x_in_stride_r, so qx is
    assumed to share x_in's row stride - confirm against callers.
    NOTE(review): an all-zero row gives scale == 0 and therefore a division
    by zero in scale_recip; behaviour for that case is left unchanged here.
    """
    pid = tl.program_id(axis=0)
    # Program ids start at 0, so the valid compiler hint is ">= 0".
    tl.assume(pid >= 0)
    tl.assume(x_in_stride_r > 0)

    offs = pid * x_in_stride_r + tl.arange(0, NUM_COL_POW2)
    mask = tl.arange(0, NUM_COL_POW2) < cols
    # Padding lanes (cols..NUM_COL_POW2) must read as 0 so they cannot win the
    # abs-max reduction; without `other` their contents are undefined.
    x = tl.load(x_in_ptr + offs, mask=mask, other=0.0, cache_modifier=".cg")

    m = tl.max(tl.abs(x), axis=-1)
    scale_out = m.to(tl.float32) / DTYPE_MAX
    scale_recip = 1 / scale_out

    qx = x * scale_recip
    qx = qx.to(qx_ptr.dtype.element_ty)

    # One scale per row.
    scale_offs = pid
    tl.store(scale_out_ptr + scale_offs, scale_out)

    tl.store(qx_ptr + offs, qx, mask=mask, cache_modifier=".cs")
@triton.jit
def _mxfp4_quant_op(
    x,
    BLOCK_SIZE_N,
    BLOCK_SIZE_M,
    MXFP4_QUANT_BLOCK_SIZE,
):
    """
    Converts given x (in fp32) to mxfp4 format.
    x: [BLOCK_SIZE_M, BLOCK_SIZE_N], fp32

    Every group of MXFP4_QUANT_BLOCK_SIZE consecutive elements along N shares
    one power-of-two scale.

    Returns:
    - x_fp4: [BLOCK_SIZE_M, BLOCK_SIZE_N // 2] uint8, two 4-bit codes per byte
      (even-indexed element in the low nibble, odd-indexed in the high nibble).
    - bs_e8m0: [BLOCK_SIZE_M, NUM_QUANT_BLOCKS] uint8 biased (+127) scale exponents.
    """
    NUM_QUANT_BLOCKS: tl.constexpr = BLOCK_SIZE_N // MXFP4_QUANT_BLOCK_SIZE
    x = x.reshape(BLOCK_SIZE_M, NUM_QUANT_BLOCKS, MXFP4_QUANT_BLOCK_SIZE)
    # Calculate scale
    amax = tl.max(tl.abs(x), axis=-1, keep_dims=True)
    # Round amax to a power of two on its raw bits: adding 0x200000 carries into
    # the exponent field when the mantissa is >= 0x600000 (value >= 1.75 * 2^e),
    # and the 0xFF800000 mask then clears the mantissa.
    amax = amax.to(tl.int32, bitcast=True)
    amax = (amax + 0x200000).to(tl.uint32, bitcast=True) & 0xFF800000
    amax = amax.to(tl.float32, bitcast=True)
    # NOTE(review): the "- 2" presumably accounts for the largest E2M1 magnitude
    # being 6.0 = 1.5 * 2^2 - confirm against the MX specification.
    scale_e8m0_unbiased = tl.log2(amax).floor() - 2
    scale_e8m0_unbiased = tl.clamp(scale_e8m0_unbiased, min=-127, max=127)

    # blockscale_e8m0
    bs_e8m0 = scale_e8m0_unbiased.to(tl.uint8) + 127  # in fp32, we have 2^(e - 127)

    quant_scale = tl.exp2(-scale_e8m0_unbiased)

    # Compute quantized x
    qx = x * quant_scale

    # Convert quantized fp32 tensor to uint32 before converting to mxfp4 format
    # Note: MXFP4  S:1-bit, E:2-bit, M:1-bit
    #   Zeros: S000 -> +/-0
    #   Denormal Numbers: S001 -> +/- 0.5
    #   Normal Numbers:
    #           S010 -> +/- 1.0
    #           S011 -> +/- 1.5
    #           S100 -> +/- 2.0
    #           S101 -> +/- 3.0
    #           S110 -> +/- 4.0
    #           S111 -> +/- 6.0
    qx = qx.to(tl.uint32, bitcast=True)

    # Extract sign, exponents and mantissa fields from FP32
    s = qx & 0x80000000
    e = (qx >> 23) & 0xFF
    m = qx & 0x7FFFFF
    E8_BIAS: tl.constexpr = 127
    E2_BIAS: tl.constexpr = 1

    # Denormal numbers
    # If exponent is less than 127, then it's a denormal number
    # See above, for denormal number mantissa is always 1 and we set bit 1 of mantissa
    # (0x400000 | (m >> 1)) makes the implicit leading one explicit; the result
    # is then shifted right by (E8_BIAS - (e + 1)) places.
    adjusted_exponents = tl.core.sub(E8_BIAS, e + 1, sanitize_overflow=False)
    m = tl.where(e < E8_BIAS, (0x400000 | (m >> 1)) >> adjusted_exponents, m)
    # For normal numbers, bias is changed from 127 to 1, and for subnormals, we keep exponent as 0.
    # Note: E8_BIAS - E2_BIAS = 126, so for normals we subtract that.
    e = tl.maximum(e, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS)

    # Combine sign, exponent, and mantissa, while saturating
    # rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right
    # (m >> 21) keeps the top two mantissa bits: the E2M1 mantissa bit plus one
    # rounding bit; 0x7 is the largest 3-bit magnitude code (6.0).
    e2m1_tmp = tl.minimum((((e << 2) | (m >> 21)) + 1) >> 1, 0x7)
    # (s >> 28) moves the fp32 sign bit (bit 31) to bit 3 of the 4-bit code.
    e2m1_value = ((s >> 28) | e2m1_tmp).to(tl.uint8)
    # Pair up neighbouring elements so two 4-bit codes can share one byte.
    e2m1_value = tl.reshape(
        e2m1_value, [BLOCK_SIZE_M, NUM_QUANT_BLOCKS, MXFP4_QUANT_BLOCK_SIZE // 2, 2]
    )
    evens, odds = tl.split(e2m1_value)
    x_fp4 = evens | (odds << 4)
    x_fp4 = x_fp4.reshape(BLOCK_SIZE_M, BLOCK_SIZE_N // 2)

    return x_fp4, bs_e8m0.reshape(BLOCK_SIZE_M, NUM_QUANT_BLOCKS)
def dynamic_mxfp4_quant(
    x: torch.Tensor, scaling_mode: str = "even"
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize a tensor to MX FP4 format.

    Allocates the two output tensors, picks a launch configuration from the
    input shape and launches `_dynamic_mxfp4_quant_kernel`.

    Args:
        x: The input tensor, typically fp16 or bf16. It is unpacked as a 2D
            tensor of shape (M, N).
        scaling_mode: The method to calculate MX block scaling.
            - "even" (default): `even_round` in `quark.torch.quantization.utils`.
            - etc.
            NOTE(review): this argument is not read anywhere in this function;
            the kernel is always launched with SCALING_MODE=0.
    Returns:
        A tuple of (x_fp4, blockscale_e8m0).
        - x_fp4: uint8 tensor of shape (M, N // 2), on x's device.
        - blockscale_e8m0: uint8 tensor of shape (M, ceil(N / 32)), on x's
          device. It is the transposed view of a (ceil(N / 32), M) allocation.
    Raises:
        AssertionError: if `(N // 2) % 2 != 0`.
    """
    # Assume x is 2D-Tensor for now
    M, N = x.shape

    # NOTE(review): this requires N // 2 to be even, which is stricter than
    # "N is even" -- confirm this is the intended constraint.
    assert (N // 2) % 2 == 0

    # This is fixed by spec for MXFP4. Do not tune this.
    MXFP4_QUANT_BLOCK_SIZE = 32
    # One output byte per pair of input elements.
    x_fp4 = torch.empty((M, N // 2), dtype=torch.uint8, device=x.device)
    # One scale per group of MXFP4_QUANT_BLOCK_SIZE columns (rounded up).
    # Allocated as (num_groups, M) and returned as its transpose, so the
    # logical shape is (M, num_groups).
    blockscale_e8m0 = torch.empty(
        ((N + MXFP4_QUANT_BLOCK_SIZE - 1) // MXFP4_QUANT_BLOCK_SIZE, M),
        dtype=torch.uint8,
        device=x.device,
    ).T

    # for large N values
    if M <= 32:
        NUM_ITER = 1
        BLOCK_SIZE_M = triton.next_power_of_2(M)
        BLOCK_SIZE_N = 32
        NUM_WARPS = 1
        NUM_STAGES = 1
    else:
        NUM_ITER = 4
        BLOCK_SIZE_M = 64
        BLOCK_SIZE_N = 64
        NUM_WARPS = 4
        NUM_STAGES = 2

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; this override is assumed to apply only when M > 32 -- confirm.
        if N <= 16384:
            BLOCK_SIZE_M = 32
            BLOCK_SIZE_N = 128

    # for small N values
    # This branch overrides the choices made above whenever N <= 1024.
    if N <= 1024:
        NUM_ITER = 1
        NUM_STAGES = 1
        NUM_WARPS = 4
        BLOCK_SIZE_N = min(256, triton.next_power_of_2(N))
        # BLOCK_SIZE_N needs to be multiple of 32
        BLOCK_SIZE_N = max(32, BLOCK_SIZE_N)
        BLOCK_SIZE_M = min(8, triton.next_power_of_2(M))

    # Axis 0 tiles the rows; axis 1 tiles the columns, with each program
    # covering NUM_ITER column tiles of width BLOCK_SIZE_N.
    grid = (
        triton.cdiv(M, BLOCK_SIZE_M),
        triton.cdiv(N, BLOCK_SIZE_N * NUM_ITER),
    )

    # Strides are passed positionally in the order (row, column) for x, x_fp4
    # and blockscale_e8m0; the latter's strides are those of the transposed view.
    _dynamic_mxfp4_quant_kernel[grid](
        x,
        x_fp4,
        blockscale_e8m0,
        *x.stride(),
        *x_fp4.stride(),
        *blockscale_e8m0.stride(),
        M=M,
        N=N,
        MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE,
        SCALING_MODE=0,
        NUM_ITER=NUM_ITER,
        BLOCK_SIZE_M=BLOCK_SIZE_M,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        NUM_STAGES=NUM_STAGES,
        num_warps=NUM_WARPS,
        waves_per_eu=0,
        num_stages=1,
    )

    return (x_fp4, blockscale_e8m0)
def num_programs(x):
    """Return the smaller of x's row count (`x.shape[0]`) and `get_num_sms()`."""
    return min(x.shape[0], get_num_sms())


def block_size(x):
    """
    Return the column block size for x.

    This is the next power of two of `x.shape[1]`, capped at
    `65536 // x.element_size()` elements.
    NOTE(review): 65536 presumably is a 64 KiB per-block byte budget -- confirm.
    """
    return min(65536 // x.element_size(), triton.next_power_of_2(x.shape[1]))


def use_blocked(x):
    """Return True when a row of x has more columns than `block_size(x)`."""
    return x.shape[1] > block_size(x)


def dg_tmp_rows(x):
    """
    Return `x.shape[0]` when `use_blocked(x)` is true, else `num_programs(x)`.

    NOTE(review): presumably the row count of a temporary weight-gradient
    buffer used by the backward pass -- confirm against the caller.
    """
    return x.shape[0] if use_blocked(x) else num_programs(x)


@triton.jit
def _per_token_quant(
    x,
    y_scale_ptr,
    row_max,
    row_idx,
    DTYPE_MAX: tl.constexpr,
    scale_ub_ptr=None,
    EPS_8BIT: tl.constexpr = 1e-12,
    CLAMP_MAX: tl.constexpr = False,
    CLAMP_OUT: tl.constexpr = False,
):
    """
    Scale x by a per-row quantization scale and store that scale.

    The scale is `row_max / DTYPE_MAX`; a scale of exactly 0 is replaced by
    1.0 so the reciprocal stays finite. The scale is written to
    `y_scale_ptr + row_idx` on every call.

    Parameters:
    - x: values to quantize.
    - y_scale_ptr: pointer to the per-row scale output.
    - row_max: maximum absolute value used to derive the scale.
    - row_idx: index of the row whose scale is stored.
    - DTYPE_MAX: largest representable magnitude of the target dtype.
    - scale_ub_ptr: pointer to an upper bound for row_max; only loaded when
      CLAMP_MAX is true.
    - EPS_8BIT: lower bound for row_max when CLAMP_MAX is true.
    - CLAMP_MAX: if true, clamp row_max to [EPS_8BIT, *scale_ub_ptr].
    - CLAMP_OUT: if true, clamp the scaled values to [-DTYPE_MAX, DTYPE_MAX].

    Returns:
    - x multiplied by the reciprocal of the scale. The cast to the output
      dtype is left to the caller.
    """

    if CLAMP_MAX:
        ub = tl.load(scale_ub_ptr)
        row_max = tl.clamp(row_max, EPS_8BIT, ub)

    scale_out = row_max / DTYPE_MAX
    # Guard against a zero scale so that the reciprocal below is finite.
    scale_out = tl.where(scale_out == 0, 1.0, scale_out)

    scale_recip = 1 / scale_out

    qx = x * scale_recip

    if CLAMP_OUT:
        qx = tl.clamp(qx, -DTYPE_MAX, DTYPE_MAX)

    tl.store(y_scale_ptr + row_idx, scale_out.to(y_scale_ptr.dtype.element_ty))

    return qx
+ + Key parameters: + - Input: The input tensor to be normalized with shape (n_rows, n_cols). + - Output: The output tensor with shape (n_rows, n_cols). + - G: The learnable weights tensor with shape (n_cols, ). + """ + # Map the program id to the first row of input and output it should compute. + row_start = tl.program_id(0) + col_offsets = tl.arange(0, BLOCK_SIZE) + + if USE_BLOCKED: + # Persistent loop for rows + for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=1): + row_input_ptr = input_ptr + row_idx * input_row_stride + row_output_ptr = output_ptr + row_idx * output_row_stride + + # Accumulate sum of squares + n_cols_blks = tl.cdiv(n_cols, BLOCK_SIZE) - 1 + sum_squares = 0.0 + for blk_idx in tl.range(0, n_cols_blks, num_stages=2): + cols = blk_idx * BLOCK_SIZE + col_offsets + input_ptrs = row_input_ptr + cols + input_ptrs = tl.multiple_of(input_ptrs, (16,)) + x = tl.load(input_ptrs).to(tl.float32) + sum_squares += tl.sum(x * x, axis=0) + + # Handle remainder + cols = n_cols_blks * BLOCK_SIZE + col_offsets + mask = cols < n_cols + input_ptrs = row_input_ptr + cols + input_ptrs = tl.multiple_of(input_ptrs, (16,)) + x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to( + tl.float32 + ) + sum_squares += tl.sum(x * x, axis=0) + + # Compute normalization factor + mean_square = sum_squares / n_cols + norm_factor = tl.rsqrt(mean_square + epsilon) + + # Store rsigma (norm_factor) + tl.store(rsigma_ptr + row_idx, norm_factor) + + # Normalize and write output + for blk_idx in tl.range(0, n_cols_blks, num_stages=2): + cols = blk_idx * BLOCK_SIZE + col_offsets + input_ptrs = row_input_ptr + cols + input_ptrs = tl.multiple_of(input_ptrs, (16,)) + x = tl.load(input_ptrs).to(tl.float32) + g_ptrs = g_ptr + cols + g = tl.load(g_ptrs).to(tl.float32) + rms_norm = x * norm_factor * g + output_ptrs = row_output_ptr + cols + tl.store(output_ptrs, rms_norm.to(output_ptr.type.element_ty)) + + # Handle remainder + cols = n_cols_blks * BLOCK_SIZE + 
@triton.jit
def _quant_rms_norm_kernel(
    # Pointers to matrices
    input_ptr,
    output_ptr,
    x_scale_ptr,
    y_scale_ptr,
    g_ptr,
    # Auxiliary tensor to store intermediate data
    aux_ptr,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `input_row_stride` is
    # how much to increase `input_ptr` by to get the element one row down.
    input_row_stride,
    output_row_stride,
    aux_row_stride,
    # Matrix dimensions
    n_rows,
    n_cols,
    # Epsilon to avoid division by zero
    epsilon,
    # Optional pointers
    scale_ub_ptr,  # Pointer to the scale upper bound tensor
    out_intermediate_ptr,  # Pointer to the intermediate output tensor
    # Dtype max for quantization
    DTYPE_MAX: tl.constexpr,
    # Meta-parameters
    IS_SMOOTH: tl.constexpr,
    CLAMP_MAX: tl.constexpr,
    CLAMP_OUT: tl.constexpr,
    DUMP_INTERMEDIATE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    USE_BLOCKED: tl.constexpr,
    NUM_PRGMS: tl.constexpr,
):
    """
    Note: this is a Triton-jitted function and not meant to be called directly. Call
    rmsnorm2d_fwd_with_smoothquant or rmsnorm2d_fwd_with_dynamicquant functions below.

    Applies Root Mean Square Layer Normalization over a mini-batch of inputs and quantizes the result.

    Key parameters:
    - Input: The input tensor to be normalized with shape (n_rows, n_cols).
    - Output: The output tensor with shape (n_rows, n_cols).
    - X_scale: The tensor to be multiplied by the RMSNorm output if IS_SMOOTH is true, with shape (n_cols, ).
    - Y_scale: The tensor where the scale for each row will be stored with shape (n_rows, ).
    - G: The learnable weights tensor with shape (n_cols, ).
    - Aux: Scratch tensor addressed with aux_row_stride; only used when USE_BLOCKED is true,
      to hold the normalized row between the max-reduction pass and the quantization pass.
    - Scale_ub / CLAMP_MAX / CLAMP_OUT: forwarded unchanged to _per_token_quant.
    - Out_intermediate: if DUMP_INTERMEDIATE is true, receives the normalized (pre-smoothing,
      pre-quantization) values. It is addressed as row_idx * n_cols + col, i.e. rows are
      assumed to be n_cols elements apart.

    Each program starts at row program_id(0) and advances by NUM_PRGMS rows per iteration.
    When USE_BLOCKED is true a row is processed in chunks of BLOCK_SIZE columns; otherwise a
    single BLOCK_SIZE-wide masked access covers the whole row.
    """
    # Map the program id to the first row of input and output it should compute.
    row_start = tl.program_id(0)
    col_offsets = tl.arange(0, BLOCK_SIZE)

    if USE_BLOCKED:
        # Persistent loop for rows
        for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=1):
            row_input_ptr = input_ptr + row_idx * input_row_stride
            row_output_ptr = output_ptr + row_idx * output_row_stride
            row_aux_ptr = aux_ptr + row_idx * aux_row_stride

            # Accumulate sum of squares
            # All column blocks except the last are accessed unmasked; the
            # last one is handled separately with a mask ("remainder").
            n_cols_blks = tl.cdiv(n_cols, BLOCK_SIZE) - 1
            sum_squares = 0.0
            for blk_idx in tl.range(0, n_cols_blks, num_stages=2):
                cols = blk_idx * BLOCK_SIZE + col_offsets
                input_ptrs = row_input_ptr + cols
                # Compiler hint that these pointers are multiples of 16;
                # NOTE(review): callers must guarantee this -- confirm.
                input_ptrs = tl.multiple_of(input_ptrs, (16,))
                x = tl.load(input_ptrs).to(tl.float32)
                sum_squares += tl.sum(x * x, axis=0)

            # Handle remainder
            cols = n_cols_blks * BLOCK_SIZE + col_offsets
            mask = cols < n_cols
            input_ptrs = row_input_ptr + cols
            input_ptrs = tl.multiple_of(input_ptrs, (16,))
            x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(
                tl.float32
            )
            sum_squares += tl.sum(x * x, axis=0)

            # Compute normalization factor
            mean_square = sum_squares / n_cols
            norm_factor = tl.rsqrt(mean_square + epsilon)

            # Running maximum of |normalized value| over the whole row.
            row_max = 0.0

            # Normalize and write output temporarily as fp32
            for blk_idx in tl.range(0, n_cols_blks, num_stages=2):
                cols = blk_idx * BLOCK_SIZE + col_offsets
                input_ptrs = row_input_ptr + cols
                input_ptrs = tl.multiple_of(input_ptrs, (16,))
                x = tl.load(input_ptrs).to(tl.float32)
                g_ptrs = g_ptr + cols
                g = tl.load(g_ptrs).to(tl.float32)
                rms_norm = x * norm_factor * g

                # The intermediate dump is taken before smoothing is applied.
                if DUMP_INTERMEDIATE:
                    tl.store(
                        out_intermediate_ptr + row_idx * n_cols + cols,
                        rms_norm.to(out_intermediate_ptr.type.element_ty),
                    )

                if IS_SMOOTH:
                    x_scale_ptrs = x_scale_ptr + cols
                    x_scale_ptrs = tl.multiple_of(x_scale_ptrs, (16,))
                    x_scale = tl.load(x_scale_ptrs)
                    rms_norm *= x_scale

                blk_max = tl.max(tl.abs(rms_norm), axis=-1)
                row_max = max(row_max, blk_max)

                aux_ptrs = row_aux_ptr + cols
                tl.store(aux_ptrs, rms_norm)

            # Handle remainder
            cols = n_cols_blks * BLOCK_SIZE + col_offsets
            mask = cols < n_cols
            input_ptrs = row_input_ptr + cols
            x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(
                tl.float32
            )
            g_ptrs = g_ptr + cols
            g = tl.load(g_ptrs, mask=mask, other=0.0).to(tl.float32)
            rms_norm = x * norm_factor * g

            if DUMP_INTERMEDIATE:
                tl.store(
                    out_intermediate_ptr + row_idx * n_cols + cols,
                    rms_norm.to(out_intermediate_ptr.type.element_ty),
                    mask=mask,
                )

            if IS_SMOOTH:
                x_scale_ptrs = x_scale_ptr + cols
                x_scale = tl.load(
                    x_scale_ptrs, mask=mask, other=0.0, cache_modifier=".cg"
                )
                rms_norm *= x_scale

            blk_max = tl.max(tl.abs(rms_norm), axis=-1)
            row_max = max(row_max, blk_max)

            aux_ptrs = row_aux_ptr + cols
            tl.store(aux_ptrs, rms_norm, mask=mask)

            # Apply quantization and write output
            # row_max is now final, so the values staged in aux can be scaled.
            # _per_token_quant also stores the row scale on each call.
            for blk_idx in tl.range(0, n_cols_blks, num_stages=2):
                cols = blk_idx * BLOCK_SIZE + col_offsets
                aux_ptrs = row_aux_ptr + cols
                aux_ptrs = tl.multiple_of(aux_ptrs, (16,))
                aux = tl.load(aux_ptrs)

                output = _per_token_quant(
                    aux,
                    y_scale_ptr,
                    row_max,
                    row_idx,
                    DTYPE_MAX,
                    scale_ub_ptr=scale_ub_ptr,
                    CLAMP_MAX=CLAMP_MAX,
                    CLAMP_OUT=CLAMP_OUT,
                )

                output_ptrs = row_output_ptr + cols
                tl.store(output_ptrs, output.to(output_ptr.dtype.element_ty))

            # Handle remainder
            cols = n_cols_blks * BLOCK_SIZE + col_offsets
            mask = cols < n_cols
            aux_ptrs = row_aux_ptr + cols
            aux = tl.load(aux_ptrs, mask=mask, other=0.0, cache_modifier=".cg")

            output = _per_token_quant(
                aux,
                y_scale_ptr,
                row_max,
                row_idx,
                DTYPE_MAX,
                scale_ub_ptr=scale_ub_ptr,
                CLAMP_MAX=CLAMP_MAX,
                CLAMP_OUT=CLAMP_OUT,
            )

            output_ptrs = row_output_ptr + cols
            tl.store(output_ptrs, output.to(output_ptr.dtype.element_ty), mask=mask)
    else:
        # The whole row fits in one BLOCK_SIZE-wide masked access, so no aux
        # staging is needed on this path.
        mask = col_offsets < n_cols
        for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=2):
            input_ptrs = input_ptr + row_idx * input_row_stride + col_offsets
            input_ptrs = tl.multiple_of(input_ptrs, (16,))
            row = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(
                tl.float32
            )
            g = tl.load(g_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32)
            row_norm = row * row
            row_norm = tl.sum(row_norm, axis=-1)
            norm_factor = tl.math.rsqrt((row_norm / n_cols) + epsilon)

            rms_norm = row * norm_factor * g

            if DUMP_INTERMEDIATE:
                tl.store(
                    out_intermediate_ptr + row_idx * n_cols + col_offsets,
                    rms_norm.to(out_intermediate_ptr.type.element_ty),
                    mask=mask,
                )

            if IS_SMOOTH:
                x_scale_ptrs = x_scale_ptr + col_offsets
                x_scale_ptrs = tl.multiple_of(x_scale_ptrs, (16,))
                x_scale = tl.load(
                    x_scale_ptrs, mask=mask, other=0.0, cache_modifier=".cg"
                )
                rms_norm *= x_scale

            row_max = tl.max(tl.abs(rms_norm), axis=-1)
            rms_norm = _per_token_quant(
                rms_norm,
                y_scale_ptr,
                row_max,
                row_idx,
                DTYPE_MAX,
                scale_ub_ptr=scale_ub_ptr,
                CLAMP_MAX=CLAMP_MAX,
                CLAMP_OUT=CLAMP_OUT,
            )

            output_ptrs = output_ptr + row_idx * output_row_stride + col_offsets
            output_ptrs = tl.multiple_of(output_ptrs, (16,))
            tl.store(output_ptrs, rms_norm.to(output_ptr.type.element_ty), mask=mask)
+ + Performs an addition between two inputs and then applies Root Mean Square Layer Normalization over + the addition result. + + Key parameters: + - Input: The input tensor to be normalized with shape (n_rows, n_cols). + - Output: The output tensor with shape (n_rows, n_cols). + - Res_in: The tensor to be added to the Input tensor with shape (n_rows, n_cols). + - Res_out: The tensor in which the addition result will be stored with shape (n_rows, n_cols). + - G: The learnable weights tensor with shape (n_cols, ). + """ + # Map the program id to the first row of input and output it should compute. + row_start = tl.program_id(0) + col_offsets = tl.arange(0, BLOCK_SIZE) + + if USE_BLOCKED: + # Persistent loop for rows + for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=1): + row_input_ptr = input_ptr + row_idx * input_row_stride + row_output_ptr = output_ptr + row_idx * output_row_stride + row_res_in_ptr = res_in_ptr + row_idx * input_row_stride + row_res_out_ptr = res_out_ptr + row_idx * input_row_stride + + # Accumulate sum of squares + n_cols_blks = tl.cdiv(n_cols, BLOCK_SIZE) - 1 + sum_squares = 0.0 + for blk_idx in tl.range(0, n_cols_blks, num_stages=2): + cols = blk_idx * BLOCK_SIZE + col_offsets + input_ptrs = row_input_ptr + cols + input_ptrs = tl.multiple_of(input_ptrs, (16,)) + x = tl.load(input_ptrs) + res_in_ptrs = row_res_in_ptr + cols + res_in_ptrs = tl.multiple_of(res_in_ptrs, (16,)) + res_in = tl.load(res_in_ptrs) + x += res_in + # Stores residual_out + res_out_ptrs = row_res_out_ptr + cols + tl.store(res_out_ptrs, x.to(res_out_ptr.type.element_ty)) + + x = x.to(tl.float32) + sum_squares += tl.sum(x * x, axis=0) + + # Handle remainder + cols = n_cols_blks * BLOCK_SIZE + col_offsets + mask = cols < n_cols + input_ptrs = row_input_ptr + cols + input_ptrs = tl.multiple_of(input_ptrs, (16,)) + x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg") + res_in_ptrs = row_res_in_ptr + cols + res_in_ptrs = tl.multiple_of(res_in_ptrs, 
(16,)) + res_in = tl.load(res_in_ptrs, mask=mask, other=0.0, cache_modifier=".cg") + x += res_in + # Stores residual_out + res_out_ptrs = row_res_out_ptr + cols + tl.store(res_out_ptrs, x.to(res_out_ptr.type.element_ty), mask=mask) + + x = x.to(tl.float32) + sum_squares += tl.sum(x * x, axis=0) + + # Compute normalization factor + mean_square = sum_squares / n_cols + norm_factor = tl.rsqrt(mean_square + epsilon) + + # Store rsigma (norm_factor) + tl.store(rsigma_ptr + row_idx, norm_factor) + + # Normalize and write output + for blk_idx in tl.range(0, n_cols_blks, num_stages=2): + cols = blk_idx * BLOCK_SIZE + col_offsets + res_out_ptrs = row_res_out_ptr + cols + res_out_ptrs = tl.multiple_of(res_out_ptrs, (16,)) + x = tl.load(res_out_ptrs).to(tl.float32) + g_ptrs = g_ptr + cols + g = tl.load(g_ptrs).to(tl.float32) + rms_norm = x * norm_factor * g + output_ptrs = row_output_ptr + cols + tl.store(output_ptrs, rms_norm.to(output_ptr.type.element_ty)) + + # Handle remainder + cols = n_cols_blks * BLOCK_SIZE + col_offsets + mask = cols < n_cols + res_out_ptrs = row_res_out_ptr + cols + x = tl.load(res_out_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to( + tl.float32 + ) + g_ptrs = g_ptr + cols + g = tl.load(g_ptrs, mask=mask, other=0.0).to(tl.float32) + rms_norm = x * norm_factor * g + output_ptrs = row_output_ptr + cols + tl.store(output_ptrs, rms_norm.to(output_ptr.type.element_ty), mask=mask) + + else: + mask = col_offsets < n_cols + for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=2): + input_ptrs = input_ptr + row_idx * input_row_stride + col_offsets + input_ptrs = tl.multiple_of(input_ptrs, (16,)) + row = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg") + res_in_ptrs = res_in_ptr + row_idx * input_row_stride + col_offsets + res_in_ptrs = tl.multiple_of(res_in_ptrs, (16,)) + res_in = tl.load(res_in_ptrs, mask=mask, other=0.0, cache_modifier=".cg") + row += res_in + # Stores residual_out + res_out_ptrs = res_out_ptr + row_idx * 
@triton.jit
def _quant_fused_add_rmsnorm_kernel(
    # Pointers to matrices
    input_ptr,
    output_ptr,
    res_in_ptr,
    res_out_ptr,
    x_scale_ptr,
    y_scale_ptr,
    g_ptr,
    # Auxiliary tensor to store intermediate data
    aux_ptr,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `input_row_stride` is
    # how much to increase `input_ptr` by to get the element one row down.
    input_row_stride,
    output_row_stride,
    aux_row_stride,
    # Matrix dimensions
    n_rows,
    n_cols,
    # Epsilon to avoid division by zero
    epsilon,
    # Dtype max for quantization
    DTYPE_MAX: tl.constexpr,
    # Meta-parameters
    IS_SMOOTH: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE: tl.constexpr,
    USE_BLOCKED: tl.constexpr,
    NUM_PRGMS: tl.constexpr,
):
    """
    Note: this is a Triton-jitted function and not meant to be called directly. Call
    rmsnorm2d_fwd_with_add_smoothquant or rmsnorm2d_fwd_with_add_dynamicquant functions below.

    Performs an addition between two inputs and then applies Root Mean Square Layer Normalization over
    the addition result followed by a quantization.

    Key parameters:
    - Input: The input tensor to be normalized with shape (n_rows, n_cols).
    - Output: The output tensor with shape (n_rows, n_cols).
    - Res_in: The tensor to be added to the Input tensor with shape (n_rows, n_cols).
    - Res_out: The tensor in which the addition result will be stored with shape (n_rows, n_cols).
    - X_scale: The tensor to be multiplied by the RMSNorm output if IS_SMOOTH is true, with shape (n_cols, ).
    - Y_scale: The tensor where the scale for each row will be stored with shape (n_rows, ).
    - G: The learnable weights tensor with shape (n_cols, ).
    - Aux: Scratch tensor addressed with aux_row_stride; only used when USE_BLOCKED is true,
      to hold the normalized row between the max-reduction pass and the quantization pass.

    Rows of Res_in and Res_out are addressed with input_row_stride, i.e. they are assumed to
    share the Input tensor's row stride.

    _per_token_quant is called with its default optional arguments here, so neither the
    row-max clamp nor the output clamp is applied.

    Each program starts at row program_id(0) and advances by NUM_PRGMS rows per iteration.
    When USE_BLOCKED is true a row is processed in chunks of BLOCK_SIZE columns; otherwise a
    single BLOCK_SIZE-wide masked access covers the whole row.
    """
    # Map the program id to the first row of input and output it should compute.
    row_start = tl.program_id(0)
    col_offsets = tl.arange(0, BLOCK_SIZE)

    if USE_BLOCKED:
        # Persistent loop for rows
        for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=1):
            row_input_ptr = input_ptr + row_idx * input_row_stride
            row_output_ptr = output_ptr + row_idx * output_row_stride
            row_res_in_ptr = res_in_ptr + row_idx * input_row_stride
            row_res_out_ptr = res_out_ptr + row_idx * input_row_stride
            row_aux_ptr = aux_ptr + row_idx * aux_row_stride

            # Accumulate sum of squares
            # All column blocks except the last are accessed unmasked; the
            # last one is handled separately with a mask ("remainder").
            n_cols_blks = tl.cdiv(n_cols, BLOCK_SIZE) - 1
            sum_squares = 0.0
            for blk_idx in tl.range(0, n_cols_blks, num_stages=2):
                cols = blk_idx * BLOCK_SIZE + col_offsets
                input_ptrs = row_input_ptr + cols
                # Compiler hint that these pointers are multiples of 16;
                # NOTE(review): callers must guarantee this -- confirm.
                input_ptrs = tl.multiple_of(input_ptrs, (16,))
                x = tl.load(input_ptrs)
                res_in_ptrs = row_res_in_ptr + cols
                res_in_ptrs = tl.multiple_of(res_in_ptrs, (16,))
                res_in = tl.load(res_in_ptrs)
                # The add happens in the loaded dtype; the fp32 cast comes
                # only after the sum has been stored to res_out.
                x += res_in
                # Stores residual_out
                res_out_ptrs = row_res_out_ptr + cols
                tl.store(res_out_ptrs, x.to(res_out_ptr.type.element_ty))

                x = x.to(tl.float32)
                sum_squares += tl.sum(x * x, axis=0)

            # Handle remainder
            cols = n_cols_blks * BLOCK_SIZE + col_offsets
            mask = cols < n_cols
            input_ptrs = row_input_ptr + cols
            input_ptrs = tl.multiple_of(input_ptrs, (16,))
            x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
            res_in_ptrs = row_res_in_ptr + cols
            res_in_ptrs = tl.multiple_of(res_in_ptrs, (16,))
            res_in = tl.load(res_in_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
            x += res_in
            # Stores residual_out
            res_out_ptrs = row_res_out_ptr + cols
            tl.store(res_out_ptrs, x.to(res_out_ptr.type.element_ty), mask=mask)

            x = x.to(tl.float32)
            sum_squares += tl.sum(x * x, axis=0)

            # Compute normalization factor
            mean_square = sum_squares / n_cols
            norm_factor = tl.rsqrt(mean_square + epsilon)

            # Running maximum of |normalized value| over the whole row.
            row_max = 0.0

            # Normalize and write output temporarily as fp32
            # The summed row is read back from res_out rather than recomputed.
            for blk_idx in tl.range(0, n_cols_blks, num_stages=2):
                cols = blk_idx * BLOCK_SIZE + col_offsets
                res_out_ptrs = row_res_out_ptr + cols
                res_out_ptrs = tl.multiple_of(res_out_ptrs, (16,))
                x = tl.load(res_out_ptrs).to(tl.float32)
                g_ptrs = g_ptr + cols
                g = tl.load(g_ptrs).to(tl.float32)
                rms_norm = x * norm_factor * g

                if IS_SMOOTH:
                    x_scale_ptrs = x_scale_ptr + cols
                    x_scale_ptrs = tl.multiple_of(x_scale_ptrs, (16,))
                    x_scale = tl.load(x_scale_ptrs)
                    rms_norm *= x_scale

                blk_max = tl.max(tl.abs(rms_norm), axis=-1)
                row_max = max(row_max, blk_max)

                aux_ptrs = row_aux_ptr + cols
                tl.store(aux_ptrs, rms_norm)

            # Handle remainder
            cols = n_cols_blks * BLOCK_SIZE + col_offsets
            mask = cols < n_cols
            res_out_ptrs = row_res_out_ptr + cols
            x = tl.load(res_out_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(
                tl.float32
            )
            g_ptrs = g_ptr + cols
            g = tl.load(g_ptrs, mask=mask, other=0.0).to(tl.float32)
            rms_norm = x * norm_factor * g

            if IS_SMOOTH:
                x_scale_ptrs = x_scale_ptr + cols
                x_scale = tl.load(
                    x_scale_ptrs, mask=mask, other=0.0, cache_modifier=".cg"
                )
                rms_norm *= x_scale

            blk_max = tl.max(tl.abs(rms_norm), axis=-1)
            row_max = max(row_max, blk_max)

            aux_ptrs = row_aux_ptr + cols
            tl.store(aux_ptrs, rms_norm, mask=mask)

            # Apply quantization and write output
            # row_max is now final, so the values staged in aux can be scaled.
            # _per_token_quant also stores the row scale on each call.
            for blk_idx in tl.range(0, n_cols_blks, num_stages=2):
                cols = blk_idx * BLOCK_SIZE + col_offsets
                aux_ptrs = row_aux_ptr + cols
                aux_ptrs = tl.multiple_of(aux_ptrs, (16,))
                aux = tl.load(aux_ptrs)

                output = _per_token_quant(
                    aux,
                    y_scale_ptr,
                    row_max,
                    row_idx,
                    DTYPE_MAX,
                )

                output_ptrs = row_output_ptr + cols
                tl.store(output_ptrs, output.to(output_ptr.dtype.element_ty))

            # Handle remainder
            cols = n_cols_blks * BLOCK_SIZE + col_offsets
            mask = cols < n_cols
            aux_ptrs = row_aux_ptr + cols
            aux = tl.load(aux_ptrs, mask=mask, other=0.0, cache_modifier=".cg")

            output = _per_token_quant(
                aux,
                y_scale_ptr,
                row_max,
                row_idx,
                DTYPE_MAX,
            )

            output_ptrs = row_output_ptr + cols
            tl.store(output_ptrs, output.to(output_ptr.dtype.element_ty), mask=mask)

    else:
        # The whole row fits in one BLOCK_SIZE-wide masked access, so no aux
        # staging is needed on this path.
        mask = col_offsets < n_cols
        for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=2):
            input_ptrs = input_ptr + row_idx * input_row_stride + col_offsets
            input_ptrs = tl.multiple_of(input_ptrs, (16,))
            row = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
            res_in_ptrs = res_in_ptr + row_idx * input_row_stride + col_offsets
            res_in_ptrs = tl.multiple_of(res_in_ptrs, (16,))
            res_in = tl.load(res_in_ptrs, mask=mask, other=0.0, cache_modifier=".cg")
            row += res_in
            # Stores residual_out
            res_out_ptrs = res_out_ptr + row_idx * input_row_stride + col_offsets
            res_out_ptrs = tl.multiple_of(res_out_ptrs, (16,))
            tl.store(res_out_ptrs, row.to(res_out_ptr.type.element_ty), mask=mask)
            row = row.to(tl.float32)

            g = tl.load(g_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32)
            row_norm = row * row
            row_norm = tl.sum(row_norm, axis=-1)
            norm_factor = tl.math.rsqrt((row_norm / n_cols) + epsilon)

            rms_norm = row * norm_factor * g

            if IS_SMOOTH:
                x_scale_ptrs = x_scale_ptr + col_offsets
                x_scale_ptrs = tl.multiple_of(x_scale_ptrs, (16,))
                x_scale = tl.load(
                    x_scale_ptrs, mask=mask, other=0.0, cache_modifier=".cg"
                )
                rms_norm *= x_scale

            row_max = tl.max(tl.abs(rms_norm), axis=-1)
            rms_norm = _per_token_quant(
                rms_norm,
                y_scale_ptr,
                row_max,
                row_idx,
                DTYPE_MAX,
            )

            output_ptrs = output_ptr + row_idx * output_row_stride + col_offsets
            output_ptrs = tl.multiple_of(output_ptrs, (16,))
            tl.store(output_ptrs, rms_norm.to(output_ptr.type.element_ty), mask=mask)
g_ptrs = g_ptr + cols + g = tl.load(g_ptrs, mask=mask, other=0.0).to(tl.float32) + grad_sum += tl.sum(grad_output * x * g, axis=0) + + # Load r_sigma + norm_factor = tl.load(rsigma_ptr + row_idx).to(tl.float32) + + for blk_idx in tl.range(0, n_cols_blks, num_stages=2): + cols = blk_idx * BLOCK_SIZE + col_offsets + input_ptrs = row_input_ptr + cols + grad_output_ptrs = row_grad_output_ptr + cols + + input_ptrs = tl.multiple_of(input_ptrs, (16,)) + grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16,)) + + x = tl.load(input_ptrs).to(tl.float32) + grad_output = tl.load(grad_output_ptrs).to(tl.float32) + + g_ptrs = g_ptr + cols + g = tl.load(g_ptrs).to(tl.float32) + grad_input = grad_output * norm_factor * g - ( + norm_factor * norm_factor * norm_factor + ) * x * (grad_sum / n_cols) + + dx_ptrs = row_dx_ptr + cols + tl.store(dx_ptrs, grad_input.to(dx_ptr.type.element_ty)) + + dg = grad_output * x * norm_factor + dg_ptrs = row_dg_ptr + cols + tl.store(dg_ptrs, dg.to(tl.float32)) + + # Handle remainder + cols = n_cols_blks * BLOCK_SIZE + col_offsets + mask = cols < n_cols + + input_ptrs = row_input_ptr + cols + x = tl.load(input_ptrs, mask=mask, other=0.0).to(tl.float32) + grad_output_ptrs = row_grad_output_ptr + cols + grad_output = tl.load(grad_output_ptrs, mask=mask, other=0.0).to(tl.float32) + g_ptrs = g_ptr + cols + g = tl.load(g_ptrs, mask=mask, other=0.0).to(tl.float32) + grad_input = grad_output * norm_factor * g - ( + norm_factor * norm_factor * norm_factor + ) * x * (grad_sum / n_cols) + + dx_ptrs = row_dx_ptr + cols + tl.store(dx_ptrs, grad_input.to(dx_ptr.type.element_ty), mask=mask) + + dg = grad_output * x * norm_factor + dg_ptrs = row_dg_ptr + cols + tl.store(dg_ptrs, dg.to(tl.float32), mask=mask) + + else: + mask = col_offsets < n_cols + dg_col_redux = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) + + for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=2): + input_ptrs = input_ptr + row_idx * input_row_stride + col_offsets + grad_output_ptrs 
@triton.jit
def _rmsnorm_bwd_dg_reduce_triton(
    dg_in_ptr,
    dg_out_ptr,
    dg_in_stride,
    n_rows,
    n_cols,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
):
    # Column-wise sum of a 2-D buffer of partial weight gradients:
    #     dg_out[c] = sum(dg_in[r, c] for r in range(n_rows))
    #
    # dg_in_ptr     base pointer of the partial-gradient buffer
    # dg_out_ptr    base pointer of the n_cols-element result
    # dg_in_stride  element distance between two consecutive rows of dg_in
    # n_rows/n_cols logical extent of dg_in
    # BLOCK_SIZE_M  rows consumed per loop iteration
    # BLOCK_SIZE_N  columns owned by one program
    #
    # we want parallelism in N direction
    # if N is small, we will just use one CU,
    # otherwise, it can be split by N/BLOCK_SIZE
    pid = tl.program_id(0)
    cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    col_mask = cols < n_cols  # loop-invariant, also reused for the final store

    # Accumulate in fp32 regardless of the input element type.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for i in range(0, n_rows, BLOCK_SIZE_M):
        rows = i + tl.arange(0, BLOCK_SIZE_M)
        mask = (rows[:, None] < n_rows) & col_mask[None, :]
        # Address rows through the stride argument instead of assuming the
        # buffer is densely packed (row pitch == n_cols). For a contiguous
        # buffer both are equal, so existing callers see no change.
        offs = rows[:, None] * dg_in_stride + cols[None, :]
        acc += tl.load(dg_in_ptr + offs, mask=mask, other=0.0, cache_modifier=".cg").to(
            tl.float32
        )

    # Collapse the BLOCK_SIZE_M partial rows and write in the output's dtype.
    sum_dg = tl.sum(acc, axis=0)
    tl.store(dg_out_ptr + cols, sum_dg.to(dg_out_ptr.type.element_ty), mask=col_mask)
def _rmsnorm_backward(dz, x, gamma, rsigma):
    """Launch the RMSNorm backward kernels and return ``(dx, dgamma)``.

    Args:
        dz: Gradient arriving at the op output; handed to the kernel as its
            ``grad_output_ptr`` argument.
        x: Forward input, unpacked here as a 2-D ``(M, N)`` tensor.
        gamma: Weight tensor; ``dgamma`` is allocated with its shape/dtype.
        rsigma: Per-row statistic saved by the forward pass (the kernel reads
            one value per row from it).

    Returns:
        Tuple ``(dx, dgamma)`` where ``dx`` is allocated like ``x`` and
        ``dgamma`` like ``gamma``.
    """
    # The kernels index with raw strides, so work on contiguous copies.
    dz_ = dz.contiguous()
    x_ = x.contiguous()
    gamma_ = gamma.contiguous()
    rsigma_ = rsigma.contiguous()

    dx = torch.empty_like(x_)
    dgamma = torch.empty_like(gamma_)

    M, N = x_.shape
    blk_size = block_size(x_)
    USE_BLOCKED = use_blocked(x_)
    NUM_PRGMS = num_programs(x_)
    # NOTE(review): for N == 1 the first kernel writes straight into dgamma
    # and no reduction runs; whether that is correct when more than one
    # program is launched should be confirmed.
    need_reduction = N > 1

    # fp32 scratch buffer for partial weight gradients (row count comes from
    # the module helper dg_tmp_rows). It must live on the same device as the
    # inputs: a bare "cuda" resolves to the *current* device, which is the
    # wrong GPU whenever x lives on a different one.
    dg_tmp = (
        torch.empty(
            dg_tmp_rows(x_),
            N,
            device=x_.device,
            dtype=torch.float32,
            requires_grad=False,
        )
        if need_reduction
        else None
    )

    grid_bwd = lambda meta: (NUM_PRGMS,)  # noqa: E731
    _rmsnorm_bwd_triton[grid_bwd](
        dz_,
        x_,
        gamma_,
        rsigma_,
        dx,
        dg_tmp if need_reduction else dgamma,
        x_.stride(0),
        dz_.stride(0),
        M,
        N,
        blk_size,
        USE_BLOCKED,
        NUM_PRGMS,
        num_warps=8,
    )

    if need_reduction:
        # Second pass: sum the partial gradients over rows into dgamma.
        grid_reduce = lambda meta: [triton.cdiv(N, meta["BLOCK_SIZE_N"])]  # noqa: E731
        _rmsnorm_bwd_dg_reduce_triton[grid_reduce](
            dg_tmp,
            dgamma,
            dg_tmp.stride(0),
            dg_tmp.shape[0],
            dg_tmp.shape[1],
            BLOCK_SIZE_M=128,
            BLOCK_SIZE_N=64,
        )

    return dx, dgamma
def rmsnorm2d_fwd_with_add(
    out: torch.Tensor,
    input: torch.Tensor,
    residual_in: torch.Tensor,
    residual_out: torch.Tensor,
    weight: torch.Tensor,
    epsilon: float,
):
    """
    Fused residual-add + RMSNorm, dispatched through ``_RMSNorm2dFwdWithAdd``.

    Key parameters:
    - Out: Destination tensor for the normalized result, shape (M, N).
    - Input: Tensor to be normalized, shape (M, N).
    - Residual_in: Tensor added to Input before normalization, shape (M, N).
    - Residual_out: Destination tensor for the addition result, shape (M, N).
    - Weight: Learnable weights, shape (N, ).
    - Epsilon: Value added to the denominator for numerical stability.

    Returns:
    - Output: Whatever ``_RMSNorm2dFwdWithAdd.apply`` returns, shape (M, N).
    """
    # The autograd Function cannot query grad mode itself once inside
    # forward, so the current state is captured here and passed along.
    grad_mode = torch.is_grad_enabled()
    fwd_args = (out, input, residual_in, residual_out, weight, epsilon, grad_mode)
    return _RMSNorm2dFwdWithAdd.apply(*fwd_args)
def rmsnorm2d_fwd_with_dynamicquant(
    out: torch.Tensor,
    input: torch.Tensor,
    yscale: torch.Tensor,
    weight: torch.Tensor,
    epsilon: float,
    scale_ub: Optional[torch.Tensor] = None,
    clamp_out: bool = False,
    dump_rms_norm: bool = False,
):
    """
    Launches the quantizing RMSNorm kernel in its dynamic-quantization
    configuration (no smoothing scale).

    Key parameters:
    - Out: The tensor where the quantized output will be stored with shape (M, N).
    - Input: The input tensor to be normalized with shape (M, N).
    - Yscale: The tensor where the scale for each row will be stored with shape (M, ).
    - Weight: The learnable weights tensor with shape (N, ).
    - Epsilon: A value added to the denominator for numerical stability.
    - Scale_ub: Optional tensor forwarded to the kernel; passing one switches
      the kernel's clamp-max flag on (presumably an upper bound for the
      scale, confirm against the kernel).
    - Clamp_out: Flag forwarded unchanged to the kernel.
    - Dump_rms_norm: When True, a buffer shaped like Input is allocated and
      handed to the kernel so the RMSNorm result can be kept as well.

    Returns:
    - The buffer allocated for Dump_rms_norm, or None when it is False.
    """
    rows, cols = input.shape

    # Launch configuration comes from module-level helpers.
    tile = block_size(input)
    blocked = use_blocked(input)
    n_programs = num_programs(input)

    smooth_scale = None  # dynamic quantization has no per-column xscale
    is_smooth = False
    dtype_max = get_dtype_max(out.dtype)
    clamp_max = scale_ub is not None

    rms_dump = torch.empty_like(input) if dump_rms_norm else None

    # Auxiliary tensor to store the RMSNorm output as fp32 before applying
    # the quantization when using the blocked approach.
    if blocked:
        aux = torch.empty(rows, cols, dtype=torch.float32, device=input.device)
        aux_row_stride = aux.stride(0)
    else:
        aux = None
        aux_row_stride = None

    def grid(meta):
        return (n_programs,)

    _quant_rms_norm_kernel[grid](
        input,
        out,
        smooth_scale,
        yscale,
        weight,
        aux,
        input.stride(0),
        out.stride(0),
        aux_row_stride,
        rows,
        cols,
        epsilon,
        scale_ub,
        rms_dump,
        dtype_max,
        is_smooth,
        clamp_max,
        clamp_out,
        dump_rms_norm,
        tile,
        blocked,
        n_programs,
    )

    return rms_dump
+ """ + n_rows, n_cols = input.shape + + blk_size = block_size(input) + USE_BLOCKED = use_blocked(input) + NUM_PRGMS = num_programs(input) + + IS_SMOOTH = True + DTYPE_MAX = get_dtype_max(out.dtype) + + # Auxiliary tensor to store the RMSNorm output as fp32 before applying the quantization when using the blocked approach + aux = None + if USE_BLOCKED: + aux = torch.empty(n_rows, n_cols, dtype=torch.float32, device=input.device) + + grid = lambda meta: (NUM_PRGMS,) # noqa: E731 + _quant_fused_add_rmsnorm_kernel[grid]( + input, + out, + residual_in, + residual_out, + xscale, + yscale, + weight, + aux, + input.stride(0), + out.stride(0), + aux.stride(0) if USE_BLOCKED else None, + n_rows, + n_cols, + epsilon, + DTYPE_MAX, + IS_SMOOTH, + blk_size, + USE_BLOCKED, + NUM_PRGMS, + ) + + +def rmsnorm2d_fwd_with_add_dynamicquant( + out: torch.Tensor, + input: torch.Tensor, + residual_in: torch.Tensor, + residual_out: torch.Tensor, + yscale: torch.Tensor, + weight: torch.Tensor, + epsilon: float, +): + """ + Performs an addition between two inputs and then applies Root Mean Square Layer Normalization over + the addition result followed by a quantization. + + Key parameters: + - Out: The tensor where the output will be stored with shape (M, N). + - Input: The input tensor to be normalized with shape (M, N). + - Residual_in: The tensor to be added to the Input tensor with shape (M, N). + - Residual_out: The tensor in which the addition result will be stored with shape (M, N). + - Yscale: The tensor where the scale for each row will be stored with shape (M, ). + - Weight: The learnable weights tensor with shape (N, ). + - Epsilon: A value added to the denominator for numerical stability. 
class RotateStyle(IntEnum):
    """Integer codes for the two rotation layouts handled in this module.

    The values are plain ints (NEOX is 0, GPTJ is 1) so members compare equal
    to, and can be built from, raw integers.
    """

    # Previously spelled ``(0,)``: Enum unpacks a tuple value into the int
    # constructor, so the member was already 0; the tuple was just noise.
    NEOX = 0
    GPTJ = 1
@triton.jit
def _get_gptj_rotated_x(
    x,
    x_rotated_mask,
    BLOCK_T: tl.constexpr,
    BLOCK_D: tl.constexpr,
    BLOCK_D_HALF: tl.constexpr,
    IS_BWD: tl.constexpr = False,
):
    # Builds the "rotated" companion of a (BLOCK_T, BLOCK_D) tile for the
    # adjacent-pair layout: apply a sign pattern, then swap the two elements
    # of every consecutive pair along the last axis.
    #
    # Sign pattern: forward keeps x where the mask is set and negates it
    # elsewhere; IS_BWD uses the opposite assignment.
    if IS_BWD:
        signed = tl.where(x_rotated_mask, -x, x)
    else:
        signed = tl.where(x_rotated_mask, x, -x)

    # View the last axis as BLOCK_D_HALF pairs and reverse each pair.
    pairs = tl.reshape(signed, (BLOCK_T, BLOCK_D_HALF, 2))
    swapped = tl.flip(pairs, 2)

    # Back to the original tile shape.
    return tl.reshape(swapped, (BLOCK_T, BLOCK_D))
Handle 1 sequence per program + b = tl.program_id(0) + h = tl.program_id(1) + pid_s = tl.program_id(2) + + s_offs = pid_s * BLOCK_S + tl.arange(0, BLOCK_S) + d_offs = tl.arange(0, BLOCK_D) + s_mask = s_offs < S + + if REUSE_FREQS_FRONT_PART: + if IS_NEOX: + d_freqs_offs = tl.where( + (d_offs >= BLOCK_D_HALF) & (d_offs < BLOCK_D), + d_offs - BLOCK_D_HALF, + d_offs, + ).to(d_offs.dtype) + d_freqs_mask = d_freqs_offs < BLOCK_D + else: + d_freqs_offs = d_offs // 2 + d_freqs_mask = d_freqs_offs < BLOCK_D_HALF + else: + d_freqs_offs = d_offs + d_freqs_mask = d_freqs_offs < BLOCK_D + + freqs_mask = s_mask[:, None] & d_freqs_mask[None, :] + freqs_offs = ( + s_offs[:, None] * stride_freqs_s + d_freqs_offs[None, :] * stride_freqs_d + ) + + freqs = tl.load(freqs_ptr + freqs_offs, mask=freqs_mask) + cos = tl.cos(freqs.to(tl.float32)) + sin = tl.sin(freqs.to(tl.float32)) + + nope_offs = 0 + if HAVE_NOPE and NOPE_FIRST: + nope_offs = BLOCK_D + + x_offs = ( + b * stride_x_b + + s_offs[:, None] * stride_x_s + + h * stride_x_h + + (d_offs + nope_offs)[None, :] * stride_x_d + ) + x_mask = s_mask[:, None] & (d_offs < BLOCK_D)[None, :] + x = tl.load(x_ptr + x_offs, mask=x_mask) + + if IS_NEOX: + x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :] + x_rotated = _get_neox_rotated_x( + x, x_rotated_mask, BLOCK_S, BLOCK_D, BLOCK_D_HALF + ) + else: + x_rotated_mask = (d_offs % 2 == 0)[None, :] + x_rotated = _get_gptj_rotated_x( + x, x_rotated_mask, BLOCK_S, BLOCK_D, BLOCK_D_HALF + ) + + out_x = x * cos + x_rotated * sin + out_x = out_x.to(x_ptr.dtype.element_ty) + x_out_offs = ( + b * stride_out_b + + s_offs[:, None] * stride_out_s + + h * stride_out_h + + (d_offs + nope_offs)[None, :] * stride_out_d + ) + + tl.store(out_ptr + x_out_offs, out_x, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask) + tl.store(out_ptr + x_out_offs - BLOCK_D * stride_out_d, x, mask=x_mask) + else: + x = tl.load(x_ptr + x_offs + BLOCK_D 
@triton.jit
def _rope_kernel_sbhd_bwd(
    x_ptr,
    freqs_ptr,
    out_ptr,
    stride_x_s,
    stride_x_b,
    stride_x_h,
    stride_x_d,
    stride_freqs_s,
    stride_freqs_b,
    stride_freqs_h,
    stride_freqs_d,
    stride_out_s,
    stride_out_b,
    stride_out_h,
    stride_out_d,
    S,
    HAVE_NOPE: tl.constexpr,
    NOPE_FIRST: tl.constexpr,
    INPLACE: tl.constexpr,
    REUSE_FREQS_FRONT_PART: tl.constexpr,
    IS_NEOX: tl.constexpr,
    BLOCK_S: tl.constexpr,
    BLOCK_D: tl.constexpr,
    BLOCK_D_HALF: tl.constexpr,
):
    # Backward-direction rotary kernel for tensors addressed through
    # (s, b, h, d) strides, with angles read from freqs_ptr.
    # Writes out = x * cos(freqs) + rotate_bwd(x * sin(freqs)) for BLOCK_D
    # rotary columns, and optionally copies a BLOCK_D-wide non-rotary slice.
    # NOTE(review): x is presumably the gradient of the forward output;
    # confirm with the caller.
    # stride_freqs_b / stride_freqs_h are accepted but never read: angles are
    # indexed by (s, d) only.
    #
    # Grid: axis 0 -> b, axis 1 -> h, axis 2 -> tile of BLOCK_S positions.
    b = tl.program_id(0)
    h = tl.program_id(1)
    pid_s = tl.program_id(2)

    s_offs = pid_s * BLOCK_S + tl.arange(0, BLOCK_S)
    d_offs = tl.arange(0, BLOCK_D)
    s_mask = s_offs < S  # guards the last, possibly partial, tile

    # Column index into freqs for every rotary column.
    if REUSE_FREQS_FRONT_PART:
        if IS_NEOX:
            # Second half of the columns reuses the angles of the first half.
            d_freqs_offs = tl.where(
                (d_offs >= BLOCK_D_HALF) & (d_offs < BLOCK_D),
                d_offs - BLOCK_D_HALF,
                d_offs,
            ).to(d_offs.dtype)
            d_freqs_mask = d_freqs_offs < BLOCK_D
        else:
            # Both columns of an adjacent pair share one angle.
            d_freqs_offs = d_offs // 2
            d_freqs_mask = d_freqs_offs < BLOCK_D_HALF
    else:
        d_freqs_offs = d_offs
        d_freqs_mask = d_freqs_offs < BLOCK_D

    freqs_mask = s_mask[:, None] & d_freqs_mask[None, :]
    freqs_offs = (
        s_offs[:, None] * stride_freqs_s + d_freqs_offs[None, :] * stride_freqs_d
    )

    freqs = tl.load(freqs_ptr + freqs_offs, mask=freqs_mask)
    cos = tl.cos(freqs.to(tl.float32))
    sin = tl.sin(freqs.to(tl.float32))

    # When the non-rotary slice comes first, rotary columns start BLOCK_D in.
    nope_offs = 0
    if HAVE_NOPE and NOPE_FIRST:
        nope_offs = BLOCK_D

    x_mask = s_mask[:, None] & (d_offs < BLOCK_D)[None, :]

    # Sign-selection mask must be built from the un-shifted column indices,
    # i.e. before d_offs is advanced by nope_offs below.
    if IS_NEOX:
        x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :]
    else:
        x_rotated_mask = (d_offs % 2 == 0)[None, :]

    d_offs += nope_offs
    x_offs = (
        b * stride_x_b
        + s_offs[:, None] * stride_x_s
        + h * stride_x_h
        + d_offs[None, :] * stride_x_d
    )
    x = tl.load(x_ptr + x_offs, mask=x_mask)

    # Backward applies sin *before* the rotation and uses the flipped sign
    # pattern (last argument True selects IS_BWD in the helper).
    if IS_NEOX:
        x_rotated = _get_neox_rotated_x(
            x * sin, x_rotated_mask, BLOCK_S, BLOCK_D, BLOCK_D_HALF, True
        )
    else:
        x_rotated = _get_gptj_rotated_x(
            x * sin, x_rotated_mask, BLOCK_S, BLOCK_D, BLOCK_D_HALF, True
        )

    out_x = x * cos + x_rotated
    out_x = out_x.to(x_ptr.dtype.element_ty)
    x_out_offs = (
        b * stride_out_b
        + s_offs[:, None] * stride_out_s
        + h * stride_out_h
        + d_offs[None, :] * stride_out_d
    )

    tl.store(out_ptr + x_out_offs, out_x, mask=x_mask)

    # Out-of-place only: pass the non-rotary slice through unchanged. It sits
    # BLOCK_D columns before (NOPE_FIRST) or after the rotary slice.
    if HAVE_NOPE and not INPLACE:
        if NOPE_FIRST:
            x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask)
            tl.store(out_ptr + x_out_offs - BLOCK_D * stride_out_d, x, mask=x_mask)
        else:
            x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask)
            tl.store(out_ptr + x_out_offs + BLOCK_D * stride_out_d, x, mask=x_mask)
@triton.jit
def _rope_kernel_thd_bwd(
    x_ptr,
    cu_seqlens_ptr,
    freqs_ptr,
    out_ptr,
    stride_x_t,
    stride_x_h,
    stride_x_d,
    stride_freqs_t,
    stride_freqs_b,
    stride_freqs_h,
    stride_freqs_d,
    stride_out_t,
    stride_out_h,
    stride_out_d,
    HAVE_NOPE: tl.constexpr,
    NOPE_FIRST: tl.constexpr,
    INPLACE: tl.constexpr,
    REUSE_FREQS_FRONT_PART: tl.constexpr,
    IS_NEOX: tl.constexpr,
    BLOCK_T: tl.constexpr,
    BLOCK_D: tl.constexpr,
    BLOCK_D_HALF: tl.constexpr,
):
    # Backward-direction rotary kernel for packed sequences addressed through
    # (t, h, d) strides; cu_seqlens_ptr[b] .. cu_seqlens_ptr[b + 1] delimits
    # the rows that belong to sequence b.
    # Writes out = x * cos(freqs) + rotate_bwd(x * sin(freqs)) for BLOCK_D
    # rotary columns, and optionally copies a BLOCK_D-wide non-rotary slice.
    # NOTE(review): x is presumably the gradient of the forward output;
    # confirm with the caller.
    # stride_freqs_b / stride_freqs_h are accepted but never read.
    #
    # Grid: axis 0 -> sequence b, axis 1 -> h, axis 2 -> tile of BLOCK_T rows.
    b = tl.program_id(0)
    h = tl.program_id(1)
    pid_t = tl.program_id(2)

    t_start = tl.load(cu_seqlens_ptr + b)
    t_end = tl.load(cu_seqlens_ptr + b + 1)
    T = t_end - t_start
    # Tiles that start past the end of this sequence have nothing to do.
    if pid_t * BLOCK_T >= T:
        return

    # t_offs is the position *within* the sequence; rows in x/out are
    # t_start + t_offs.
    t_offs = pid_t * BLOCK_T + tl.arange(0, BLOCK_T)
    d_offs = tl.arange(0, BLOCK_D)
    t_mask = t_offs < T

    # Column index into freqs for every rotary column.
    if REUSE_FREQS_FRONT_PART:
        if IS_NEOX:
            # Second half of the columns reuses the angles of the first half.
            d_freqs_offs = tl.where(
                (d_offs >= BLOCK_D_HALF) & (d_offs < BLOCK_D),
                d_offs - BLOCK_D_HALF,
                d_offs,
            ).to(d_offs.dtype)
            d_freqs_mask = d_freqs_offs < BLOCK_D
        else:
            # Both columns of an adjacent pair share one angle.
            d_freqs_offs = d_offs // 2
            d_freqs_mask = d_freqs_offs < BLOCK_D_HALF
    else:
        d_freqs_offs = d_offs
        d_freqs_mask = d_freqs_offs < BLOCK_D

    # Angles are looked up by in-sequence position, not by absolute row.
    freqs_mask = t_mask[:, None] & d_freqs_mask[None, :]
    freqs_offs = (
        t_offs[:, None] * stride_freqs_t + d_freqs_offs[None, :] * stride_freqs_d
    )
    freqs = tl.load(freqs_ptr + freqs_offs, mask=freqs_mask)
    cos = tl.cos(freqs.to(tl.float32))
    sin = tl.sin(freqs.to(tl.float32))

    # When the non-rotary slice comes first, rotary columns start BLOCK_D in.
    nope_offs = 0
    if HAVE_NOPE and NOPE_FIRST:
        nope_offs = BLOCK_D

    x_mask = t_mask[:, None] & (d_offs < BLOCK_D)[None, :]

    # Sign-selection mask must be built from the un-shifted column indices,
    # i.e. before d_offs is advanced by nope_offs below.
    if IS_NEOX:
        x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :]
    else:
        x_rotated_mask = (d_offs % 2 == 0)[None, :]

    d_offs += nope_offs
    x_offs = (
        (t_start + t_offs)[:, None] * stride_x_t
        + h * stride_x_h
        + d_offs[None, :] * stride_x_d
    )
    x = tl.load(x_ptr + x_offs, mask=x_mask)

    # Backward applies sin *before* the rotation and uses the flipped sign
    # pattern (last argument True selects IS_BWD in the helper).
    if IS_NEOX:
        x_rotated = _get_neox_rotated_x(
            x * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True
        )
    else:
        x_rotated = _get_gptj_rotated_x(
            x * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True
        )

    out_x = x * cos + x_rotated
    out_x = out_x.to(x_ptr.dtype.element_ty)
    x_out_offs = (
        (t_start + t_offs)[:, None] * stride_out_t
        + h * stride_out_h
        + d_offs[None, :] * stride_out_d
    )

    tl.store(out_ptr + x_out_offs, out_x, mask=x_mask)

    # Out-of-place only: pass the non-rotary slice through unchanged. It sits
    # BLOCK_D columns before (NOPE_FIRST) or after the rotary slice.
    if HAVE_NOPE and not INPLACE:
        if NOPE_FIRST:
            x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask)
            tl.store(out_ptr + x_out_offs - BLOCK_D * stride_out_d, x, mask=x_mask)
        else:
            x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask)
            tl.store(out_ptr + x_out_offs + BLOCK_D * stride_out_d, x, mask=x_mask)
@triton.jit
def _rope_kernel_sbhd_cached_bwd(
    x_ptr,
    cos_ptr,
    sin_ptr,
    pos_ptr,
    off_ptr,
    out_ptr,
    stride_x_s,
    stride_x_b,
    stride_x_h,
    stride_x_d,
    stride_cos_s,
    stride_cos_b,
    stride_cos_h,
    stride_cos_d,
    stride_pos_s,
    stride_pos_b,
    stride_out_s,
    stride_out_b,
    stride_out_h,
    stride_out_d,
    S,
    HAVE_NOPE: tl.constexpr,
    NOPE_FIRST: tl.constexpr,
    INPLACE: tl.constexpr,
    REUSE_FREQS_FRONT_PART: tl.constexpr,
    IS_NEOX: tl.constexpr,
    HAVE_POS: tl.constexpr,
    HAVE_OFFS: tl.constexpr,
    BLOCK_S: tl.constexpr,
    BLOCK_D: tl.constexpr,
    BLOCK_D_HALF: tl.constexpr,
):
    # Backward-direction rotary kernel for tensors addressed through
    # (s, b, h, d) strides, reading precomputed cos/sin tables instead of
    # evaluating them from angles.
    # Writes out = x * cos + rotate_bwd(x * sin) for BLOCK_D rotary columns,
    # and optionally copies a BLOCK_D-wide non-rotary slice.
    # NOTE(review): x is presumably the gradient of the forward output;
    # confirm with the caller.
    # stride_cos_b / stride_cos_h are accepted but never read, and the sin
    # table is addressed with the offsets built from the cos strides.
    #
    # Grid: axis 0 -> b, axis 1 -> h, axis 2 -> tile of BLOCK_S positions.
    b = tl.program_id(0)
    h = tl.program_id(1)
    pid_s = tl.program_id(2)

    s_offs = pid_s * BLOCK_S + tl.arange(0, BLOCK_S)
    d_offs = tl.arange(0, BLOCK_D)
    s_mask = s_offs < S  # guards the last, possibly partial, tile

    # Row of the cos/sin tables to use for every position in the tile:
    # the position itself, or pos[s, b], or pos[s, b] + off[s, b].
    if HAVE_POS:
        pos_offs = s_offs * stride_pos_s + b * stride_pos_b
        pos = tl.load(pos_ptr + pos_offs, mask=s_mask)
        if HAVE_OFFS:
            # off_ptr is indexed with the same offsets as pos_ptr.
            offset = tl.load(off_ptr + pos_offs, mask=s_mask)
            s_cos_offs = pos + offset
        else:
            s_cos_offs = pos
    else:
        s_cos_offs = s_offs

    # Column of the cos/sin tables for every rotary column.
    if REUSE_FREQS_FRONT_PART:
        if IS_NEOX:
            # Second half of the columns reuses the entries of the first half.
            d_cos_offs = d_offs
            d_cos_offs = tl.where(
                (d_cos_offs >= BLOCK_D_HALF) & (d_cos_offs < BLOCK_D),
                d_cos_offs - BLOCK_D_HALF,
                d_cos_offs,
            ).to(d_cos_offs.dtype)
            d_cos_mask = d_cos_offs < BLOCK_D
        else:
            # Both columns of an adjacent pair share one entry.
            d_cos_offs = d_offs // 2
            d_cos_mask = d_cos_offs < BLOCK_D_HALF
    else:
        d_cos_offs = d_offs
        d_cos_mask = d_cos_offs < BLOCK_D

    cos_mask = s_mask[:, None] & d_cos_mask[None, :]
    cos_offs = s_cos_offs[:, None] * stride_cos_s + d_cos_offs[None, :] * stride_cos_d
    cos = tl.load(cos_ptr + cos_offs, mask=cos_mask)
    sin = tl.load(sin_ptr + cos_offs, mask=cos_mask)

    # When the non-rotary slice comes first, rotary columns start BLOCK_D in.
    nope_offs = 0
    if HAVE_NOPE and NOPE_FIRST:
        nope_offs = BLOCK_D

    x_mask = s_mask[:, None] & (d_offs < BLOCK_D)[None, :]

    # Sign-selection mask must be built from the un-shifted column indices,
    # i.e. before d_offs is advanced by nope_offs below.
    if IS_NEOX:
        x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :]
    else:
        x_rotated_mask = (d_offs % 2 == 0)[None, :]

    d_offs += nope_offs
    x_offs = (
        b * stride_x_b
        + s_offs[:, None] * stride_x_s
        + h * stride_x_h
        + d_offs[None, :] * stride_x_d
    )
    x = tl.load(x_ptr + x_offs, mask=x_mask)

    # Backward applies sin *before* the rotation and uses the flipped sign
    # pattern (last argument True selects IS_BWD in the helper).
    if IS_NEOX:
        x_rotated = _get_neox_rotated_x(
            x * sin, x_rotated_mask, BLOCK_S, BLOCK_D, BLOCK_D_HALF, True
        )
    else:
        x_rotated = _get_gptj_rotated_x(
            x * sin, x_rotated_mask, BLOCK_S, BLOCK_D, BLOCK_D_HALF, True
        )

    out_x = x * cos + x_rotated
    out_x = out_x.to(x_ptr.dtype.element_ty)
    x_out_offs = (
        b * stride_out_b
        + s_offs[:, None] * stride_out_s
        + h * stride_out_h
        + d_offs[None, :] * stride_out_d
    )

    tl.store(out_ptr + x_out_offs, out_x, mask=x_mask)

    # Out-of-place only: pass the non-rotary slice through unchanged. It sits
    # BLOCK_D columns before (NOPE_FIRST) or after the rotary slice.
    if HAVE_NOPE and not INPLACE:
        if NOPE_FIRST:
            x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask)
            tl.store(out_ptr + x_out_offs - BLOCK_D * stride_out_d, x, mask=x_mask)
        else:
            x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask)
            tl.store(out_ptr + x_out_offs + BLOCK_D * stride_out_d, x, mask=x_mask)
BLOCK_D_HALF + else: + d_cos_offs = tl.arange(0, BLOCK_D) // 2 + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = d_offs + d_cos_mask = d_cos_offs < BLOCK_D + + cos_mask = t_mask[:, None] & d_cos_mask[None, :] + cos_offs = t_cos_offs[:, None] * stride_cos_t + d_cos_offs[None, :] * stride_cos_d + cos = tl.load(cos_ptr + cos_offs, mask=cos_mask) + sin = tl.load(sin_ptr + cos_offs, mask=cos_mask) + + nope_offs = 0 + if HAVE_NOPE and NOPE_FIRST: + nope_offs = BLOCK_D + + h_start_idx = h_s * SPLIT_H_SIZE + + x_mask = t_mask[:, None] & (d_offs < BLOCK_D)[None, :] + + if IS_NEOX: + x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :] + else: + x_rotated_mask = (d_offs % 2 == 0)[None, :] + + d_offs += nope_offs + for h in tl.range(0, SPLIT_H_SIZE, 1, num_stages=num_stages): + x_offs = ( + t_offs[:, None] * stride_x_t + + d_offs[None, :] * stride_x_d + + (h_start_idx + h) * stride_x_h + ) + y_offs = ( + t_offs[:, None] * stride_y_t + + d_offs[None, :] * stride_y_d + + (h_start_idx + h) * stride_y_h + ) + + x = tl.load(x_ptr + x_offs, mask=x_mask) + y = tl.load(y_ptr + y_offs, mask=x_mask) + + if IS_NEOX: + x_rotated = _get_neox_rotated_x( + x, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + y_rotated = _get_neox_rotated_x( + y, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + else: + x_rotated = _get_gptj_rotated_x( + x, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + y_rotated = _get_gptj_rotated_x( + y, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + + out_x = x * cos + x_rotated * sin + out_x = out_x.to(x_ptr.dtype.element_ty) + out_y = y * cos + y_rotated * sin + out_y = out_y.to(y_ptr.dtype.element_ty) + + out_x_offs = ( + t_offs[:, None] * stride_out_x_t + + d_offs[None, :] * stride_out_x_d + + (h_start_idx + h) * stride_out_x_h + ) + out_y_offs = ( + t_offs[:, None] * stride_out_y_t + + d_offs[None, :] * stride_out_y_d + + (h_start_idx + h) * stride_out_y_h + ) + tl.store(out_x_ptr + out_x_offs, out_x, mask=x_mask) + tl.store(out_y_ptr + 
out_y_offs, out_y, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs - BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + y = tl.load(y_ptr + y_offs - BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + out_y_offs - BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + else: + x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs + BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + y = tl.load(y_ptr + y_offs + BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + out_y_offs + BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + + +@triton.jit +def _rope_kernel_thd_cached_2c_bwd( + x_ptr, + y_ptr, + cos_ptr, + sin_ptr, + pos_ptr, + off_ptr, + out_x_ptr, + out_y_ptr, + stride_x_t, + stride_x_h, + stride_x_d, + stride_y_t, + stride_y_h, + stride_y_d, + stride_cos_t, + stride_cos_d, + stride_pos_t, + stride_out_x_t, + stride_out_x_h, + stride_out_x_d, + stride_out_y_t, + stride_out_y_h, + stride_out_y_d, + T, + HAVE_NOPE: tl.constexpr, + NOPE_FIRST: tl.constexpr, + INPLACE: tl.constexpr, + REUSE_FREQS_FRONT_PART: tl.constexpr, + IS_NEOX: tl.constexpr, + HAVE_POS: tl.constexpr, + HAVE_OFFS: tl.constexpr, + BLOCK_T: tl.constexpr, + SPLIT_H_SIZE: tl.constexpr, + BLOCK_D: tl.constexpr, + BLOCK_D_HALF: tl.constexpr, + num_stages: tl.constexpr, +): + h_s = tl.program_id(0) + pid_t = tl.program_id(1) + + tl.assume(stride_x_t > 0) + tl.assume(stride_x_h > 0) + tl.assume(stride_x_d > 0) + tl.assume(stride_y_t > 0) + tl.assume(stride_y_h > 0) + tl.assume(stride_y_d > 0) + tl.assume(stride_cos_t > 0) + tl.assume(stride_cos_d > 0) + tl.assume(stride_pos_t > 0) + tl.assume(stride_out_x_t > 0) + tl.assume(stride_out_x_h > 0) + tl.assume(stride_out_x_d > 0) + tl.assume(stride_out_y_t > 0) + tl.assume(stride_out_y_h > 0) + tl.assume(stride_out_y_d > 0) + + t_offs = pid_t * BLOCK_T + tl.arange(0, BLOCK_T) + d_offs = tl.arange(0, 
BLOCK_D) + t_mask = t_offs < T + + if HAVE_POS: + pos_offs = t_offs * stride_pos_t + pos = tl.load(pos_ptr + pos_offs, mask=t_mask) + if HAVE_OFFS: + offset = tl.load(off_ptr + pos_offs, mask=t_mask) + t_cos_offs = pos + offset + else: + t_cos_offs = pos + else: + t_cos_offs = t_offs + + if REUSE_FREQS_FRONT_PART: + if IS_NEOX: + d_cos_offs = d_offs + d_cos_offs = tl.where( + (d_cos_offs < BLOCK_D_HALF), + d_cos_offs, + d_cos_offs - BLOCK_D_HALF, + ).to(d_cos_offs.dtype) + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = tl.arange(0, BLOCK_D) // 2 + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = d_offs + d_cos_mask = d_cos_offs < BLOCK_D + + cos_mask = t_mask[:, None] & d_cos_mask[None, :] + cos_offs = t_cos_offs[:, None] * stride_cos_t + d_cos_offs[None, :] * stride_cos_d + cos = tl.load(cos_ptr + cos_offs, mask=cos_mask) + sin = tl.load(sin_ptr + cos_offs, mask=cos_mask) + + nope_offs = 0 + if HAVE_NOPE and NOPE_FIRST: + nope_offs = BLOCK_D + + h_start_idx = h_s * SPLIT_H_SIZE + + x_mask = t_mask[:, None] & (d_offs < BLOCK_D)[None, :] + + if IS_NEOX: + x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :] + else: + x_rotated_mask = (d_offs % 2 == 0)[None, :] + + d_offs += nope_offs + for h in tl.range(0, SPLIT_H_SIZE, 1, num_stages=num_stages): + x_offs = ( + t_offs[:, None] * stride_x_t + + d_offs[None, :] * stride_x_d + + (h_start_idx + h) * stride_x_h + ) + y_offs = ( + t_offs[:, None] * stride_y_t + + d_offs[None, :] * stride_y_d + + (h_start_idx + h) * stride_y_h + ) + + x = tl.load(x_ptr + x_offs, mask=x_mask) + y = tl.load(y_ptr + y_offs, mask=x_mask) + + if IS_NEOX: + x_rotated = _get_neox_rotated_x( + x * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + y_rotated = _get_neox_rotated_x( + y * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + else: + x_rotated = _get_gptj_rotated_x( + x * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + y_rotated = _get_gptj_rotated_x( + y * sin, 
x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + + out_x = x * cos + x_rotated + out_x = out_x.to(x_ptr.dtype.element_ty) + out_y = y * cos + y_rotated + out_y = out_y.to(y_ptr.dtype.element_ty) + + out_x_offs = ( + t_offs[:, None] * stride_out_x_t + + d_offs[None, :] * stride_out_x_d + + (h_start_idx + h) * stride_out_x_h + ) + out_y_offs = ( + t_offs[:, None] * stride_out_y_t + + d_offs[None, :] * stride_out_y_d + + (h_start_idx + h) * stride_out_y_h + ) + tl.store(out_x_ptr + out_x_offs, out_x, mask=x_mask) + tl.store(out_y_ptr + out_y_offs, out_y, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + # TODO check + if NOPE_FIRST: + x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs - BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + y = tl.load(y_ptr + y_offs - BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + out_y_offs - BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + else: + x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs + BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + y = tl.load(y_ptr + y_offs + BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + out_y_offs + BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + + +@triton.jit +def _rope_kernel_cached_thd_2c_gqa_fwd( + x_ptr, + y_ptr, + cos_ptr, + sin_ptr, + pos_ptr, + off_ptr, + out_x_ptr, + out_y_ptr, + stride_x_t, + stride_x_h, + stride_x_d, + stride_y_t, + stride_y_h, + stride_y_d, + stride_cos_t, + stride_cos_d, + stride_pos_t, + stride_out_x_t, + stride_out_x_h, + stride_out_x_d, + stride_out_y_t, + stride_out_y_h, + stride_out_y_d, + T, + HAVE_NOPE: tl.constexpr, + NOPE_FIRST: tl.constexpr, + INPLACE: tl.constexpr, + REUSE_FREQS_FRONT_PART: tl.constexpr, + IS_NEOX: tl.constexpr, + HAVE_POS: tl.constexpr, + HAVE_OFFS: tl.constexpr, + BLOCK_T: tl.constexpr, + QH_per_G: tl.constexpr, + BLOCK_D: tl.constexpr, + BLOCK_D_HALF: tl.constexpr, + num_stages: tl.constexpr, +): + h_s = 
tl.program_id(0) + pid_t = tl.program_id(1) + + tl.assume(stride_x_t > 0) + tl.assume(stride_x_h > 0) + tl.assume(stride_x_d > 0) + tl.assume(stride_y_t > 0) + tl.assume(stride_y_h > 0) + tl.assume(stride_y_d > 0) + tl.assume(stride_cos_t > 0) + tl.assume(stride_cos_d > 0) + tl.assume(stride_pos_t > 0) + tl.assume(stride_out_x_t > 0) + tl.assume(stride_out_x_h > 0) + tl.assume(stride_out_x_d > 0) + tl.assume(stride_out_y_t > 0) + tl.assume(stride_out_y_h > 0) + tl.assume(stride_out_y_d > 0) + + t_offs = pid_t * BLOCK_T + tl.arange(0, BLOCK_T) + d_offs = tl.arange(0, BLOCK_D) + t_mask = t_offs < T + + if HAVE_POS: + pos_offs = t_offs * stride_pos_t + pos = tl.load(pos_ptr + pos_offs, mask=t_mask) + if HAVE_OFFS: + offset = tl.load(off_ptr + pos_offs, mask=t_mask) + t_cos_offs = pos + offset + else: + t_cos_offs = pos + else: + t_cos_offs = t_offs + + if REUSE_FREQS_FRONT_PART: + if IS_NEOX: + d_cos_offs = d_offs + d_cos_offs = tl.where( + (d_cos_offs < BLOCK_D_HALF), + d_cos_offs, + d_cos_offs - BLOCK_D_HALF, + ).to(d_cos_offs.dtype) + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = tl.arange(0, BLOCK_D) // 2 + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = d_offs + d_cos_mask = d_cos_offs < BLOCK_D + + cos_mask = t_mask[:, None] & d_cos_mask[None, :] + cos_offs = t_cos_offs[:, None] * stride_cos_t + d_cos_offs[None, :] * stride_cos_d + cos = tl.load(cos_ptr + cos_offs, mask=cos_mask) + sin = tl.load(sin_ptr + cos_offs, mask=cos_mask) + + nope_offs = 0 + if HAVE_NOPE and NOPE_FIRST: + nope_offs = BLOCK_D + + h_start_idx = h_s * QH_per_G + + x_mask = t_mask[:, None] & (d_offs < BLOCK_D)[None, :] + + if IS_NEOX: + x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :] + else: + x_rotated_mask = (d_offs % 2 == 0)[None, :] + + d_offs += nope_offs + y_offs = ( + t_offs[:, None] * stride_y_t + d_offs[None, :] * stride_y_d + h_s * stride_y_h + ) + y = tl.load(y_ptr + y_offs, mask=x_mask) + + if IS_NEOX: + y_rotated = _get_neox_rotated_x( + y, 
x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + else: + y_rotated = _get_gptj_rotated_x( + y, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + + out_y_offs = ( + t_offs[:, None] * stride_out_y_t + + d_offs[None, :] * stride_out_y_d + + h_s * stride_out_y_h + ) + out_y = y * cos + y_rotated * sin + out_y = out_y.to(y_ptr.dtype.element_ty) + tl.store(out_y_ptr + out_y_offs, out_y, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + y = tl.load(y_ptr + y_offs - BLOCK_D * stride_y_d, mask=x_mask) + tl.store(out_y_ptr + out_y_offs - BLOCK_D * stride_out_y_d, y, mask=x_mask) + else: + y = tl.load(y_ptr + y_offs + BLOCK_D * stride_y_d, mask=x_mask) + tl.store(out_y_ptr + out_y_offs + BLOCK_D * stride_out_y_d, y, mask=x_mask) + + for h in tl.range(0, QH_per_G, 1, num_stages=num_stages): + x_offs = ( + t_offs[:, None] * stride_x_t + + d_offs[None, :] * stride_x_d + + (h_start_idx + h) * stride_x_h + ) + + x = tl.load(x_ptr + x_offs, mask=x_mask) + + if IS_NEOX: + x_rotated = _get_neox_rotated_x( + x, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + else: + x_rotated = _get_gptj_rotated_x( + x, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + + out_x_offs = ( + t_offs[:, None] * stride_out_x_t + + d_offs[None, :] * stride_out_x_d + + (h_start_idx + h) * stride_out_x_h + ) + out_x = x * cos + x_rotated * sin + out_x = out_x.to(x_ptr.dtype.element_ty) + + tl.store(out_x_ptr + out_x_offs, out_x, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs - BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + else: + x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs + BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + + +@triton.jit +def _rope_kernel_cached_thd_2c_gqa_onehead_fwd( + x_ptr, + y_ptr, + cos_ptr, + sin_ptr, + pos_ptr, + off_ptr, + out_x_ptr, + out_y_ptr, + stride_x_t, + stride_x_h, + 
stride_x_d, + stride_y_t, + stride_y_h, + stride_y_d, + stride_cos_t, + stride_cos_d, + stride_pos_t, + stride_out_x_t, + stride_out_x_h, + stride_out_x_d, + stride_out_y_t, + stride_out_y_h, + stride_out_y_d, + T, + HAVE_NOPE: tl.constexpr, + NOPE_FIRST: tl.constexpr, + INPLACE: tl.constexpr, + REUSE_FREQS_FRONT_PART: tl.constexpr, + IS_NEOX: tl.constexpr, + HAVE_POS: tl.constexpr, + HAVE_OFFS: tl.constexpr, + BLOCK_T: tl.constexpr, + G: tl.constexpr, + BLOCK_D: tl.constexpr, + BLOCK_D_HALF: tl.constexpr, +): + pid_t = tl.program_id(0) + hq = tl.program_id(1) + + tl.assume(stride_x_t > 0) + tl.assume(stride_x_h > 0) + tl.assume(stride_x_d > 0) + tl.assume(stride_y_t > 0) + tl.assume(stride_y_h > 0) + tl.assume(stride_y_d > 0) + tl.assume(stride_cos_t > 0) + tl.assume(stride_cos_d > 0) + tl.assume(stride_pos_t > 0) + tl.assume(stride_out_x_t > 0) + tl.assume(stride_out_x_h > 0) + tl.assume(stride_out_x_d > 0) + tl.assume(stride_out_y_t > 0) + tl.assume(stride_out_y_h > 0) + tl.assume(stride_out_y_d > 0) + + t_offs = pid_t * BLOCK_T + tl.arange(0, BLOCK_T) + d_offs = tl.arange(0, BLOCK_D) + t_mask = t_offs < T + + if HAVE_POS: + pos_offs = t_offs * stride_pos_t + pos = tl.load(pos_ptr + pos_offs, mask=t_mask) + if HAVE_OFFS: + offset = tl.load(off_ptr + pos_offs, mask=t_mask) + t_cos_offs = pos + offset + else: + t_cos_offs = pos + else: + t_cos_offs = t_offs + + if REUSE_FREQS_FRONT_PART: + if IS_NEOX: + d_cos_offs = d_offs + d_cos_offs = tl.where( + (d_cos_offs < BLOCK_D_HALF), + d_cos_offs, + d_cos_offs - BLOCK_D_HALF, + ).to(d_cos_offs.dtype) + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = tl.arange(0, BLOCK_D) // 2 + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = d_offs + d_cos_mask = d_cos_offs < BLOCK_D + + cos_mask = t_mask[:, None] & d_cos_mask[None, :] + cos_offs = t_cos_offs[:, None] * stride_cos_t + d_cos_offs[None, :] * stride_cos_d + cos = tl.load(cos_ptr + cos_offs, mask=cos_mask) + sin = tl.load(sin_ptr + cos_offs, 
mask=cos_mask) + + nope_offs = 0 + if HAVE_NOPE and NOPE_FIRST: + nope_offs = BLOCK_D + + x_mask = t_mask[:, None] & (d_offs < BLOCK_D)[None, :] + + if IS_NEOX: + x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :] + else: + x_rotated_mask = (d_offs % 2 == 0)[None, :] + + d_offs += nope_offs + x_offs = ( + t_offs[:, None] * stride_x_t + d_offs[None, :] * stride_x_d + hq * stride_x_h + ) + x = tl.load(x_ptr + x_offs, mask=x_mask) + + if IS_NEOX: + x_rotated = _get_neox_rotated_x( + x, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + else: + x_rotated = _get_gptj_rotated_x( + x, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + + out_x_offs = ( + t_offs[:, None] * stride_out_x_t + + d_offs[None, :] * stride_out_x_d + + hq * stride_out_x_h + ) + out_x = x * cos + x_rotated * sin + out_x = out_x.to(x_ptr.dtype.element_ty) + tl.store(out_x_ptr + out_x_offs, out_x, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask) + tl.store(out_x_ptr + out_x_offs - BLOCK_D * stride_out_x_d, x, mask=x_mask) + else: + x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask) + tl.store(out_x_ptr + out_x_offs + BLOCK_D * stride_out_x_d, x, mask=x_mask) + + if hq < G: + y_offs = ( + t_offs[:, None] * stride_y_t + + d_offs[None, :] * stride_x_d + + hq * stride_y_h + ) + y = tl.load(y_ptr + y_offs, mask=x_mask) + + if IS_NEOX: + y_rotated = _get_neox_rotated_x( + y, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + else: + y_rotated = _get_gptj_rotated_x( + y, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF + ) + + out_y_offs = ( + t_offs[:, None] * stride_out_y_t + + d_offs[None, :] * stride_out_y_d + + hq * stride_out_y_h + ) + out_y = y * cos + y_rotated * sin + out_y = out_y.to(y_ptr.dtype.element_ty) + tl.store(out_y_ptr + out_y_offs, out_y, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + y = tl.load(y_ptr + y_offs - BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + 
out_y_offs - BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + else: + y = tl.load(y_ptr + y_offs + BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + out_y_offs + BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + + +@triton.jit +def _rope_kernel_cached_thd_2c_gqa_bwd( + x_ptr, + y_ptr, + cos_ptr, + sin_ptr, + pos_ptr, + off_ptr, + out_x_ptr, + out_y_ptr, + stride_x_t, + stride_x_h, + stride_x_d, + stride_y_t, + stride_y_h, + stride_y_d, + stride_cos_t, + stride_cos_d, + stride_pos_t, + stride_out_x_t, + stride_out_x_h, + stride_out_x_d, + stride_out_y_t, + stride_out_y_h, + stride_out_y_d, + T, + HAVE_NOPE: tl.constexpr, + NOPE_FIRST: tl.constexpr, + INPLACE: tl.constexpr, + REUSE_FREQS_FRONT_PART: tl.constexpr, + IS_NEOX: tl.constexpr, + HAVE_POS: tl.constexpr, + HAVE_OFFS: tl.constexpr, + BLOCK_T: tl.constexpr, + QH_per_G: tl.constexpr, + BLOCK_D: tl.constexpr, + BLOCK_D_HALF: tl.constexpr, + num_stages: tl.constexpr, +): + h_s = tl.program_id(0) + pid_t = tl.program_id(1) + + tl.assume(stride_x_t > 0) + tl.assume(stride_x_h > 0) + tl.assume(stride_x_d > 0) + tl.assume(stride_y_t > 0) + tl.assume(stride_y_h > 0) + tl.assume(stride_y_d > 0) + tl.assume(stride_cos_t > 0) + tl.assume(stride_cos_d > 0) + tl.assume(stride_pos_t > 0) + tl.assume(stride_out_x_t > 0) + tl.assume(stride_out_x_h > 0) + tl.assume(stride_out_x_d > 0) + tl.assume(stride_out_y_t > 0) + tl.assume(stride_out_y_h > 0) + tl.assume(stride_out_y_d > 0) + + t_offs = pid_t * BLOCK_T + tl.arange(0, BLOCK_T) + d_offs = tl.arange(0, BLOCK_D) + t_mask = t_offs < T + + if HAVE_POS: + pos_offs = t_offs * stride_pos_t + pos = tl.load(pos_ptr + pos_offs, mask=t_mask) + if HAVE_OFFS: + offset = tl.load(off_ptr + pos_offs, mask=t_mask) + t_cos_offs = pos + offset + else: + t_cos_offs = pos + else: + t_cos_offs = t_offs + + if REUSE_FREQS_FRONT_PART: + if IS_NEOX: + d_cos_offs = d_offs + d_cos_offs = tl.where( + (d_cos_offs < BLOCK_D_HALF), + d_cos_offs, + d_cos_offs - BLOCK_D_HALF, + ).to(d_cos_offs.dtype) 
+ d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = tl.arange(0, BLOCK_D) // 2 + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = d_offs + d_cos_mask = d_cos_offs < BLOCK_D + + cos_mask = t_mask[:, None] & d_cos_mask[None, :] + cos_offs = t_cos_offs[:, None] * stride_cos_t + d_cos_offs[None, :] * stride_cos_d + cos = tl.load(cos_ptr + cos_offs, mask=cos_mask) + sin = tl.load(sin_ptr + cos_offs, mask=cos_mask) + + nope_offs = 0 + if HAVE_NOPE and NOPE_FIRST: + nope_offs = BLOCK_D + + h_start_idx = h_s * QH_per_G + + x_mask = t_mask[:, None] & (d_offs < BLOCK_D)[None, :] + + if IS_NEOX: + x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :] + else: + x_rotated_mask = (d_offs % 2 == 0)[None, :] + + d_offs += nope_offs + y_offs = ( + t_offs[:, None] * stride_y_t + d_offs[None, :] * stride_y_d + h_s * stride_y_h + ) + y = tl.load(y_ptr + y_offs, mask=x_mask) + + if IS_NEOX: + y_rotated = _get_neox_rotated_x( + y * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + else: + y_rotated = _get_gptj_rotated_x( + y * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + + out_y_offs = ( + t_offs[:, None] * stride_out_y_t + + d_offs[None, :] * stride_out_y_d + + h_s * stride_out_y_h + ) + out_y = y * cos + y_rotated + out_y = out_y.to(y_ptr.dtype.element_ty) + tl.store(out_y_ptr + out_y_offs, out_y, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + y = tl.load(y_ptr + y_offs - BLOCK_D * stride_y_d, mask=x_mask) + tl.store(out_y_ptr + out_y_offs - BLOCK_D * stride_out_y_d, y, mask=x_mask) + else: + y = tl.load(y_ptr + y_offs + BLOCK_D * stride_y_d, mask=x_mask) + tl.store(out_y_ptr + out_y_offs + BLOCK_D * stride_out_y_d, y, mask=x_mask) + + for h in tl.range(0, QH_per_G, 1, num_stages=num_stages): + x_offs = ( + t_offs[:, None] * stride_x_t + + d_offs[None, :] * stride_x_d + + (h_start_idx + h) * stride_x_h + ) + + x = tl.load(x_ptr + x_offs, mask=x_mask) + + if IS_NEOX: + x_rotated = _get_neox_rotated_x( + x * sin, 
x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + else: + x_rotated = _get_gptj_rotated_x( + x * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + + out_x_offs = ( + t_offs[:, None] * stride_out_x_t + + d_offs[None, :] * stride_out_x_d + + (h_start_idx + h) * stride_out_x_h + ) + out_x = x * cos + x_rotated + out_x = out_x.to(x_ptr.dtype.element_ty) + + tl.store(out_x_ptr + out_x_offs, out_x, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs - BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + else: + x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask) + tl.store( + out_x_ptr + out_x_offs + BLOCK_D * stride_out_x_d, x, mask=x_mask + ) + + +@triton.jit +def _rope_kernel_cached_thd_2c_gqa_onehead_bwd( + x_ptr, + y_ptr, + cos_ptr, + sin_ptr, + pos_ptr, + off_ptr, + out_x_ptr, + out_y_ptr, + stride_x_t, + stride_x_h, + stride_x_d, + stride_y_t, + stride_y_h, + stride_y_d, + stride_cos_t, + stride_cos_d, + stride_pos_t, + stride_out_x_t, + stride_out_x_h, + stride_out_x_d, + stride_out_y_t, + stride_out_y_h, + stride_out_y_d, + T, + HAVE_NOPE: tl.constexpr, + NOPE_FIRST: tl.constexpr, + INPLACE: tl.constexpr, + REUSE_FREQS_FRONT_PART: tl.constexpr, + IS_NEOX: tl.constexpr, + HAVE_POS: tl.constexpr, + HAVE_OFFS: tl.constexpr, + BLOCK_T: tl.constexpr, + G: tl.constexpr, + BLOCK_D: tl.constexpr, + BLOCK_D_HALF: tl.constexpr, +): + pid_t = tl.program_id(0) + hq = tl.program_id(1) + + tl.assume(stride_x_t > 0) + tl.assume(stride_x_h > 0) + tl.assume(stride_x_d > 0) + tl.assume(stride_y_t > 0) + tl.assume(stride_y_h > 0) + tl.assume(stride_y_d > 0) + tl.assume(stride_cos_t > 0) + tl.assume(stride_cos_d > 0) + tl.assume(stride_pos_t > 0) + tl.assume(stride_out_x_t > 0) + tl.assume(stride_out_x_h > 0) + tl.assume(stride_out_x_d > 0) + tl.assume(stride_out_y_t > 0) + tl.assume(stride_out_y_h > 0) + tl.assume(stride_out_y_d > 0) + + 
t_offs = pid_t * BLOCK_T + tl.arange(0, BLOCK_T) + d_offs = tl.arange(0, BLOCK_D) + t_mask = t_offs < T + + if HAVE_POS: + pos_offs = t_offs * stride_pos_t + pos = tl.load(pos_ptr + pos_offs, mask=t_mask) + if HAVE_OFFS: + offset = tl.load(off_ptr + pos_offs, mask=t_mask) + t_cos_offs = pos + offset + else: + t_cos_offs = pos + else: + t_cos_offs = t_offs + + if REUSE_FREQS_FRONT_PART: + if IS_NEOX: + d_cos_offs = d_offs + d_cos_offs = tl.where( + (d_cos_offs < BLOCK_D_HALF), + d_cos_offs, + d_cos_offs - BLOCK_D_HALF, + ).to(d_cos_offs.dtype) + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = tl.arange(0, BLOCK_D) // 2 + d_cos_mask = d_cos_offs < BLOCK_D_HALF + else: + d_cos_offs = d_offs + d_cos_mask = d_cos_offs < BLOCK_D + + cos_mask = t_mask[:, None] & d_cos_mask[None, :] + cos_offs = t_cos_offs[:, None] * stride_cos_t + d_cos_offs[None, :] * stride_cos_d + cos = tl.load(cos_ptr + cos_offs, mask=cos_mask) + sin = tl.load(sin_ptr + cos_offs, mask=cos_mask) + + nope_offs = 0 + if HAVE_NOPE and NOPE_FIRST: + nope_offs = BLOCK_D + + x_mask = t_mask[:, None] & (d_offs < BLOCK_D)[None, :] + + if IS_NEOX: + x_rotated_mask = (d_offs < BLOCK_D_HALF)[None, :] + else: + x_rotated_mask = (d_offs % 2 == 0)[None, :] + + d_offs += nope_offs + x_offs = ( + t_offs[:, None] * stride_x_t + d_offs[None, :] * stride_x_d + hq * stride_x_h + ) + x = tl.load(x_ptr + x_offs, mask=x_mask) + + if IS_NEOX: + x_rotated = _get_neox_rotated_x( + x * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + else: + x_rotated = _get_gptj_rotated_x( + x * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + + out_x_offs = ( + t_offs[:, None] * stride_out_x_t + + d_offs[None, :] * stride_out_x_d + + hq * stride_out_x_h + ) + out_x = x * cos + x_rotated + out_x = out_x.to(x_ptr.dtype.element_ty) + tl.store(out_x_ptr + out_x_offs, out_x, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + x = tl.load(x_ptr + x_offs - BLOCK_D * stride_x_d, mask=x_mask) + 
tl.store(out_x_ptr + out_x_offs - BLOCK_D * stride_out_x_d, x, mask=x_mask) + else: + x = tl.load(x_ptr + x_offs + BLOCK_D * stride_x_d, mask=x_mask) + tl.store(out_x_ptr + out_x_offs + BLOCK_D * stride_out_x_d, x, mask=x_mask) + + if hq < G: + y_offs = ( + t_offs[:, None] * stride_y_t + + d_offs[None, :] * stride_x_d + + hq * stride_y_h + ) + y = tl.load(y_ptr + y_offs, mask=x_mask) + + if IS_NEOX: + y_rotated = _get_neox_rotated_x( + y * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + else: + y_rotated = _get_gptj_rotated_x( + y * sin, x_rotated_mask, BLOCK_T, BLOCK_D, BLOCK_D_HALF, True + ) + + out_y_offs = ( + t_offs[:, None] * stride_out_y_t + + d_offs[None, :] * stride_out_y_d + + hq * stride_out_y_h + ) + out_y = y * cos + y_rotated + out_y = out_y.to(y_ptr.dtype.element_ty) + tl.store(out_y_ptr + out_y_offs, out_y, mask=x_mask) + + if HAVE_NOPE and not INPLACE: + if NOPE_FIRST: + y = tl.load(y_ptr + y_offs - BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + out_y_offs - BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + else: + y = tl.load(y_ptr + y_offs + BLOCK_D * stride_y_d, mask=x_mask) + tl.store( + out_y_ptr + out_y_offs + BLOCK_D * stride_out_y_d, y, mask=x_mask + ) + + +@triton.jit +def _rope_fwd_2d_kernel_neox( + x_ptr, + cos_h_ptr, + sin_h_ptr, + cos_w_ptr, + sin_w_ptr, + out_ptr, + stride_x_b, + stride_x_wh, + stride_x_h, + stride_x_d, + stride_cos_h_b, + stride_cos_h_ht, + stride_cos_h_h, + stride_cos_h_d, + stride_cos_w_b, + stride_cos_w_w, + stride_cos_w_h, + stride_cos_w_d, + WH: tl.constexpr, + HEIGHT: tl.constexpr, + WEIGHT: tl.constexpr, + BLOCK_D: tl.constexpr, +): + b = tl.program_id(0) + h = tl.program_id(1) + + # load cos_h [HT, BLOCK_D] + offs_wh = tl.arange(0, WH) + offs_cos_h_h = offs_wh // WEIGHT + offs_d = tl.arange(0, BLOCK_D) + offs_cos_h = ( + stride_cos_h_h * offs_cos_h_h[:, None] + stride_cos_h_d * offs_d[None, :] + ) + mask_cos_h = offs_d < BLOCK_D // 2 + cos_h = tl.load(cos_h_ptr + offs_cos_h, 
mask=mask_cos_h[None, :]) + + # load sin_h + sin_h = tl.load(sin_h_ptr + offs_cos_h, mask=mask_cos_h[None, :]) + + # load cos_w + offs_cos_w_w = offs_wh % WEIGHT + offs_cos_w_d = offs_d - BLOCK_D // 2 + offs_cos_w = ( + stride_cos_w_w * offs_cos_w_w[:, None] + stride_cos_w_d * offs_cos_w_d[None, :] + ) + mask_cos_w = (offs_cos_w_d >= 0) & (offs_cos_w_d < BLOCK_D // 2) + cos_w = tl.load(cos_w_ptr + offs_cos_w, mask=mask_cos_w[None, :]) + + # load sin_w + sin_w = tl.load(sin_w_ptr + offs_cos_w, mask=mask_cos_w[None, :]) + + # load x + offs_wh = tl.arange(0, WH) + offs_x = ( + stride_x_b * b + + stride_x_wh * offs_wh[:, None] + + stride_x_h * h + + stride_x_d * offs_d[None, :] + ) + x = tl.load(x_ptr + offs_x) + + # load x_rotated + offs_wh = tl.arange(0, WH) + offs_d_rotated = tl.where(offs_d < BLOCK_D // 4, offs_d + BLOCK_D // 4, offs_d) + offs_d_rotated = tl.where( + (offs_d >= BLOCK_D // 4) & (offs_d < BLOCK_D // 2), + offs_d_rotated - BLOCK_D // 4, + offs_d_rotated, + ) + offs_d_rotated = tl.where( + (offs_d >= BLOCK_D // 2) & (offs_d < 3 * BLOCK_D // 4), + offs_d_rotated + BLOCK_D // 4, + offs_d_rotated, + ) + offs_d_rotated = tl.where( + (offs_d >= 3 * BLOCK_D // 4) & (offs_d < BLOCK_D), + offs_d_rotated - BLOCK_D // 4, + offs_d_rotated, + ) + offs_x_rotated = ( + stride_x_b * b + + stride_x_wh * offs_wh[:, None] + + stride_x_h * h + + stride_x_d * offs_d_rotated[None, :] + ) + x_rotated = tl.load(x_ptr + offs_x_rotated) + neg_x_rotated = tl.where((offs_d >= BLOCK_D // 4) & (offs_d < BLOCK_D // 2), 1, 0) + neg_x_rotated = tl.where( + (offs_d >= 3 * BLOCK_D // 4) & (offs_d < BLOCK_D), 1, neg_x_rotated + ) + x_rotated = tl.where(neg_x_rotated, x_rotated, -x_rotated) + + # compute x1 + x1 = x * cos_h + x_rotated * sin_h + + # compute x2 + x2 = x * cos_w + x_rotated * sin_w + + # compute output + out = x1 + x2 + + # store output + tl.store(out_ptr + offs_x, out) + + +# TODO: For now BLOCK_D is assumed to be power of 2. Expand to handle other value of D. 
def _derive_rope_block_dims(
    freqs: torch.Tensor, d: int, reuse_freqs_front_part: bool
) -> tuple:
    """Derive ``(have_nope, BLOCK_D, BLOCK_D_HALF)`` from the width of ``freqs``.

    Shared by every RoPE launcher below so that the rule lives in one place.

    The last dimension of ``freqs`` is compared against the last dimension
    ``d`` of the input:

    * ``d // 2`` wide: no-position-embedding ("nope") part is present only
      when the front part of ``freqs`` is NOT reused.
    * ``d // 4`` wide: a nope part is always assumed.
    * anything else: no nope part.

    With a nope part the rotated block is ``d // 2`` wide, otherwise it is the
    full ``d``; ``BLOCK_D_HALF`` is always half of ``BLOCK_D``.
    """
    freqs_d = freqs.shape[-1]

    # NOTE: keep this as an if/elif chain. For tiny ``d`` the values ``d // 2``
    # and ``d // 4`` coincide and the first matching branch must win.
    if freqs_d == d // 2:
        have_nope = not reuse_freqs_front_part
    elif freqs_d == d // 4:
        have_nope = True
    else:
        have_nope = False

    if have_nope:
        return have_nope, d // 2, d // 4
    return have_nope, d, d // 2


def _rope_fwd(
    x: torch.Tensor,
    out: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    inplace: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Launch ``_rope_kernel_sbhd_fwd`` on a 4-D input and return ``out``.

    ``x.shape`` is unpacked as ``(s, b, h, d)``; one program is launched per
    ``(b, h, block of 32 along s)``. ``out`` is written by the kernel and may
    be ``x`` itself when ``inplace`` is true.

    ``transpose_output`` is accepted for interface compatibility but is not
    used by this launcher.
    """
    s, b, h, d = x.shape
    have_nope, BLOCK_D, BLOCK_D_HALF = _derive_rope_block_dims(
        freqs, d, reuse_freqs_front_part
    )

    # TODO: performance optimization
    BLOCK_S = 32
    num_warps = 4
    waves_per_eu = 0
    grid = (b, h, triton.cdiv(s, BLOCK_S))

    _rope_kernel_sbhd_fwd[grid](
        x,
        freqs,
        out,
        *x.stride(),
        *freqs.stride(),
        *out.stride(),
        s,
        HAVE_NOPE=have_nope,
        NOPE_FIRST=nope_first,
        INPLACE=inplace,
        REUSE_FREQS_FRONT_PART=reuse_freqs_front_part,
        IS_NEOX=(rotate_style == RotateStyle.NEOX),
        BLOCK_S=BLOCK_S,
        BLOCK_D=BLOCK_D,
        BLOCK_D_HALF=BLOCK_D_HALF,
        num_warps=num_warps,
        waves_per_eu=waves_per_eu,
    )

    return out


def rope_fwd(
    x: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Apply the forward RoPE kernel to ``x`` and return a new tensor.

    The result has the same shape, dtype and device as ``x`` and is allocated
    here with ``torch.empty``; ``x`` is passed to the kernel as a separate
    input buffer (``INPLACE=False``).
    """
    s, b, h, d = x.shape
    out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False)

    _rope_fwd(
        x,
        out,
        freqs,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        False,
        transpose_output,
    )

    return out


def rope_fwd_inplace(
    x: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Apply the forward RoPE kernel with ``x`` as both input and output.

    Returns ``x`` itself after the kernel has written into it.
    """
    out = x

    _rope_fwd(
        x,
        out,
        freqs,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        True,
        transpose_output,
    )

    return out


def _rope_bwd(
    x: torch.Tensor,
    out: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    inplace: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Launch ``_rope_kernel_sbhd_bwd`` on a 4-D input and return ``out``.

    Mirrors ``_rope_fwd`` exactly except for the kernel that is launched.
    NOTE(review): ``x`` is presumably the incoming gradient -- confirm
    against the callers.

    ``transpose_output`` is accepted for interface compatibility but is not
    used by this launcher.
    """
    s, b, h, d = x.shape
    have_nope, BLOCK_D, BLOCK_D_HALF = _derive_rope_block_dims(
        freqs, d, reuse_freqs_front_part
    )

    # TODO: performance optimization
    BLOCK_S = 32
    num_warps = 4
    waves_per_eu = 0
    grid = (b, h, triton.cdiv(s, BLOCK_S))

    _rope_kernel_sbhd_bwd[grid](
        x,
        freqs,
        out,
        *x.stride(),
        *freqs.stride(),
        *out.stride(),
        s,
        HAVE_NOPE=have_nope,
        NOPE_FIRST=nope_first,
        INPLACE=inplace,
        REUSE_FREQS_FRONT_PART=reuse_freqs_front_part,
        IS_NEOX=(rotate_style == RotateStyle.NEOX),
        BLOCK_S=BLOCK_S,
        BLOCK_D=BLOCK_D,
        BLOCK_D_HALF=BLOCK_D_HALF,
        num_warps=num_warps,
        waves_per_eu=waves_per_eu,
    )

    return out


def rope_bwd(
    x: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Apply the backward RoPE kernel to ``x`` and return a new tensor.

    The result has the same shape, dtype and device as ``x`` and is allocated
    here with ``torch.empty`` (``INPLACE=False``).
    """
    s, b, h, d = x.shape
    out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False)

    _rope_bwd(
        x,
        out,
        freqs,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        False,
        transpose_output,
    )

    return out


def _rope_thd_fwd(
    x: torch.Tensor,
    out: torch.Tensor,
    cu_seqlens: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    inplace: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Launch ``_rope_kernel_thd_fwd`` on a 3-D input and return ``out``.

    ``x.shape`` is unpacked as ``(t, h, d)`` and the batch count is taken as
    ``numel(cu_seqlens) - 1``; one program is launched per
    ``(b, h, block of 32 along t)``.
    NOTE(review): ``cu_seqlens`` presumably holds cumulative sequence
    boundaries into the ``t`` axis -- confirm against the kernel.

    ``transpose_output`` is accepted for interface compatibility but is not
    used by this launcher.
    """
    b = torch.numel(cu_seqlens) - 1
    t, h, d = x.shape
    have_nope, BLOCK_D, BLOCK_D_HALF = _derive_rope_block_dims(
        freqs, d, reuse_freqs_front_part
    )

    # TODO: performance optimization
    BLOCK_T = 32
    num_warps = 4
    waves_per_eu = 0
    grid = (b, h, triton.cdiv(t, BLOCK_T))

    _rope_kernel_thd_fwd[grid](
        x,
        cu_seqlens,
        freqs,
        out,
        *x.stride(),
        *freqs.stride(),
        *out.stride(),
        HAVE_NOPE=have_nope,
        NOPE_FIRST=nope_first,
        INPLACE=inplace,
        REUSE_FREQS_FRONT_PART=reuse_freqs_front_part,
        IS_NEOX=(rotate_style == RotateStyle.NEOX),
        BLOCK_T=BLOCK_T,
        BLOCK_D=BLOCK_D,
        BLOCK_D_HALF=BLOCK_D_HALF,
        num_warps=num_warps,
        waves_per_eu=waves_per_eu,
    )

    return out


def rope_thd_fwd(
    x: torch.Tensor,
    cu_seqlens: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Apply the forward thd RoPE kernel to ``x`` and return a new tensor.

    The result has the same shape, dtype and device as ``x`` and is allocated
    here with ``torch.empty`` (``INPLACE=False``).
    """
    t, h, d = x.shape
    out = torch.empty((t, h, d), dtype=x.dtype, device=x.device, requires_grad=False)

    _rope_thd_fwd(
        x,
        out,
        cu_seqlens,
        freqs,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        False,
        transpose_output,
    )

    return out


def rope_thd_fwd_inplace(
    x: torch.Tensor,
    cu_seqlens: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Apply the forward thd RoPE kernel with ``x`` as both input and output.

    Returns ``x`` itself after the kernel has written into it.
    """
    out = x

    _rope_thd_fwd(
        x,
        out,
        cu_seqlens,
        freqs,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        True,
        transpose_output,
    )

    return out


def _rope_thd_bwd(
    x: torch.Tensor,
    out: torch.Tensor,
    cu_seqlens: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    inplace: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Launch ``_rope_kernel_thd_bwd`` on a 3-D input and return ``out``.

    Mirrors ``_rope_thd_fwd`` exactly except for the kernel that is launched.

    ``transpose_output`` is accepted for interface compatibility but is not
    used by this launcher.
    """
    b = torch.numel(cu_seqlens) - 1
    t, h, d = x.shape
    have_nope, BLOCK_D, BLOCK_D_HALF = _derive_rope_block_dims(
        freqs, d, reuse_freqs_front_part
    )

    # TODO: performance optimization
    BLOCK_T = 32
    num_warps = 4
    waves_per_eu = 0
    grid = (b, h, triton.cdiv(t, BLOCK_T))

    _rope_kernel_thd_bwd[grid](
        x,
        cu_seqlens,
        freqs,
        out,
        *x.stride(),
        *freqs.stride(),
        *out.stride(),
        HAVE_NOPE=have_nope,
        NOPE_FIRST=nope_first,
        INPLACE=inplace,
        REUSE_FREQS_FRONT_PART=reuse_freqs_front_part,
        IS_NEOX=(rotate_style == RotateStyle.NEOX),
        BLOCK_T=BLOCK_T,
        BLOCK_D=BLOCK_D,
        BLOCK_D_HALF=BLOCK_D_HALF,
        num_warps=num_warps,
        waves_per_eu=waves_per_eu,
    )

    return out


def rope_thd_bwd(
    x: torch.Tensor,
    cu_seqlens: torch.Tensor,
    freqs: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
) -> torch.Tensor:
    """Apply the backward thd RoPE kernel to ``x`` and return a new tensor.

    The result has the same shape, dtype and device as ``x`` and is allocated
    here with ``torch.empty`` (``INPLACE=False``).
    """
    t, h, d = x.shape
    out = torch.empty((t, h, d), dtype=x.dtype, device=x.device, requires_grad=False)

    _rope_thd_bwd(
        x,
        out,
        cu_seqlens,
        freqs,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        False,
        transpose_output,
    )

    return out


# TODO: For now BLOCK_D is assumed to be power of 2. Expand to handle other value of D.
+def _rope_cached_fwd( + x: torch.Tensor, + out: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + offsets: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + inplace: bool, + transpose_output: bool = False, +) -> torch.Tensor: + s, b, h, d = x.shape + + if cos.shape[-1] == d // 2: + if reuse_freqs_front_part: + have_nope = False + else: + have_nope = True + elif cos.shape[-1] == d // 4: + have_nope = True + else: + have_nope = False + + if have_nope: + BLOCK_D = d // 2 + BLOCK_D_HALF = d // 4 + else: + BLOCK_D = d + BLOCK_D_HALF = d // 2 + + # TODO: performance optimization + BLOCK_S = 32 + num_warps = 4 + waves_per_eu = 0 + grid = (b, h, triton.cdiv(s, BLOCK_S)) + + pos_stride = positions.stride() if positions is not None else (1, 1) + _rope_kernel_sbhd_cached_fwd[grid]( + x, + cos, + sin, + positions, + offsets, + out, + *x.stride(), + *cos.stride(), + *pos_stride, + *out.stride(), + s, + HAVE_NOPE=have_nope, + NOPE_FIRST=nope_first, + INPLACE=inplace, + REUSE_FREQS_FRONT_PART=reuse_freqs_front_part, + IS_NEOX=(rotate_style == RotateStyle.NEOX), + HAVE_POS=(positions is not None), + HAVE_OFFS=(offsets is not None), + BLOCK_S=BLOCK_S, + BLOCK_D=BLOCK_D, + BLOCK_D_HALF=BLOCK_D_HALF, + num_warps=num_warps, + waves_per_eu=waves_per_eu, + ) + + return out + + +def rope_cached_fwd( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +): + s, b, h, d = x.shape + out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False) + + _rope_cached_fwd( + x, + out, + cos, + sin, + None, + None, + rotate_style, + reuse_freqs_front_part, + nope_first, + False, + transpose_output, + ) + + return out + + +def rope_cached_fwd_inplace( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + 
transpose_output: bool = False, +): + out = x + + _rope_cached_fwd( + x, + out, + cos, + sin, + None, + None, + rotate_style, + reuse_freqs_front_part, + nope_first, + True, + transpose_output, + ) + + return out + + +def rope_cached_positions_fwd( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> torch.Tensor: + s, b, h, d = x.shape + out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False) + + _rope_cached_fwd( + x, + out, + cos, + sin, + positions, + None, + rotate_style, + reuse_freqs_front_part, + nope_first, + False, + transpose_output, + ) + + return out + + +def rope_cached_positions_fwd_inplace( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> torch.Tensor: + out = x + + _rope_cached_fwd( + x, + out, + cos, + sin, + positions, + None, + rotate_style, + reuse_freqs_front_part, + nope_first, + True, + transpose_output, + ) + + return out + + +def rope_cached_positions_offsets_fwd( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + offsets: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> torch.Tensor: + s, b, h, d = x.shape + out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False) + + _rope_cached_fwd( + x, + out, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part, + nope_first, + False, + transpose_output, + ) + + return out + + +def rope_cached_positions_offsets_fwd_inplace( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + offsets: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + 
transpose_output: bool = False, +) -> torch.Tensor: + out = x + + _rope_cached_fwd( + x, + out, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part, + nope_first, + True, + transpose_output, + ) + + return out + + +def _rope_cached_bwd( + x: torch.Tensor, + out: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + offsets: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + inplace: bool, + transpose_output: bool = False, +) -> torch.Tensor: + s, b, h, d = x.shape + + if cos.shape[-1] == d // 2: + if reuse_freqs_front_part: + have_nope = False + else: + have_nope = True + elif cos.shape[-1] == d // 4: + have_nope = True + else: + have_nope = False + + if have_nope: + BLOCK_D = d // 2 + BLOCK_D_HALF = d // 4 + else: + BLOCK_D = d + BLOCK_D_HALF = d // 2 + + # TODO: performance optimization + BLOCK_S = 32 + num_warps = 4 + waves_per_eu = 0 + grid = (b, h, triton.cdiv(s, BLOCK_S)) + + pos_stride = positions.stride() if positions is not None else (1, 1) + _rope_kernel_sbhd_cached_bwd[grid]( + x, + cos, + sin, + positions, + offsets, + out, + *x.stride(), + *cos.stride(), + *pos_stride, + *out.stride(), + s, + HAVE_NOPE=have_nope, + NOPE_FIRST=nope_first, + INPLACE=inplace, + REUSE_FREQS_FRONT_PART=reuse_freqs_front_part, + IS_NEOX=(rotate_style == RotateStyle.NEOX), + HAVE_POS=(positions is not None), + HAVE_OFFS=(offsets is not None), + BLOCK_S=BLOCK_S, + BLOCK_D=BLOCK_D, + BLOCK_D_HALF=BLOCK_D_HALF, + num_warps=num_warps, + waves_per_eu=waves_per_eu, + ) + + return out + + +def rope_cached_bwd( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +): + s, b, h, d = x.shape + out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False) + + _rope_cached_bwd( + x, + out, + cos, + sin, + None, + None, + rotate_style, + 
reuse_freqs_front_part, + nope_first, + False, + transpose_output, + ) + + return out + + +def rope_cached_positions_bwd( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> torch.Tensor: + s, b, h, d = x.shape + out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False) + + _rope_cached_bwd( + x, + out, + cos, + sin, + positions, + None, + rotate_style, + reuse_freqs_front_part, + nope_first, + False, + transpose_output, + ) + + return out + + +def rope_cached_positions_offsets_bwd( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + offsets: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +) -> torch.Tensor: + s, b, h, d = x.shape + out = torch.empty((s, b, h, d), dtype=x.dtype, device=x.device, requires_grad=False) + + _rope_cached_bwd( + x, + out, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part, + nope_first, + False, + transpose_output, + ) + + return out + + +def _rope_cached_thd_2c_fwd( + x: torch.Tensor, + y: torch.Tensor, + out_x: torch.Tensor, + out_y: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + offsets: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + inplace: bool, + transpose_output: bool = False, +): + t, h, d = x.shape + ty, kh, dy = y.shape + + assert ( + t == ty + ), f"The number of tokens should be the same for the two inputs, but got {t} and {ty}" + assert ( + d == dy + ), f"The head dimension should be the same for the two inputs, but got {d} and {dy}" + assert h % kh == 0, f"QH should be multiple of KH, but got QH={h} and KH={kh}" + + if cos.shape[-1] == d // 2: + if reuse_freqs_front_part: + have_nope = False + else: + have_nope = True + elif 
cos.shape[-1] == d // 4: + have_nope = True + else: + have_nope = False + + if have_nope: + BLOCK_D = d // 2 + BLOCK_D_HALF = d // 4 + else: + BLOCK_D = d + BLOCK_D_HALF = d // 2 + + if h == kh: + BLOCK_T = 32 + SPLIT_T = (triton.next_power_of_2(t) + BLOCK_T - 1) // BLOCK_T + + if t >= 8192: + MIN_NUM_WG = 4096 + elif t >= 1024: + MIN_NUM_WG = 1024 + else: + MIN_NUM_WG = 512 + + if SPLIT_T < MIN_NUM_WG: + SPLIT_H_SIZE = h + SPLIT_H = (triton.next_power_of_2(h) + SPLIT_H_SIZE - 1) // SPLIT_H_SIZE + while SPLIT_H * SPLIT_T < MIN_NUM_WG and SPLIT_H_SIZE > 1: + SPLIT_H_SIZE = SPLIT_H_SIZE // 2 + SPLIT_H = (triton.next_power_of_2(h) + SPLIT_H_SIZE - 1) // SPLIT_H_SIZE + else: + SPLIT_H_SIZE = h + + SPLIT_H = (triton.next_power_of_2(h) + SPLIT_H_SIZE - 1) // SPLIT_H_SIZE + grid = (SPLIT_H, SPLIT_T, 1) + num_warps = 4 + waves_per_eu = 0 + num_stages = 2 if SPLIT_H_SIZE > 1 else 1 + + _rope_kernel_thd_cached_2c_fwd[grid]( + x, + y, + cos, + sin, + positions, + offsets, + out_x, + out_y, + *x.stride(), + *y.stride(), + *cos.stride(), + *positions.stride(), + *out_x.stride(), + *out_y.stride(), + t, + HAVE_NOPE=have_nope, + NOPE_FIRST=nope_first, + INPLACE=inplace, + REUSE_FREQS_FRONT_PART=reuse_freqs_front_part, + IS_NEOX=(rotate_style == RotateStyle.NEOX), + HAVE_POS=(positions is not None), + HAVE_OFFS=(offsets is not None), + BLOCK_T=BLOCK_T, + SPLIT_H_SIZE=SPLIT_H_SIZE, + BLOCK_D=BLOCK_D, + BLOCK_D_HALF=BLOCK_D_HALF, + num_warps=num_warps, + waves_per_eu=waves_per_eu, + num_stages=num_stages, + ) + else: + # TODO check boundary + if rotate_style == RotateStyle.GPTJ and t >= 1024: + BLOCK_T = 32 + SPLIT_T = (triton.next_power_of_2(t) + BLOCK_T - 1) // BLOCK_T + QH_per_G = h // kh + grid = (kh, SPLIT_T, 1) + num_warps = 4 + waves_per_eu = 0 + num_stages = 2 if QH_per_G > 1 else 1 + + _rope_kernel_cached_thd_2c_gqa_fwd[grid]( + x, + y, + cos, + sin, + positions, + offsets, + out_x, + out_y, + *x.stride(), + *y.stride(), + *cos.stride(), + *positions.stride(), + 
def rope_cached_thd_positions_2c_fwd(
    x: torch.Tensor,
    y: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    positions: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
):
    """Apply cached-cos/sin RoPE to two THD tensors, writing to new outputs.

    Allocates one output per input and forwards everything to
    ``_rope_cached_thd_2c_fwd`` with ``offsets=None`` and ``inplace=False``.

    Args:
        x: First input; the launcher unpacks its shape as ``(t, h, d)``.
        y: Second input; the launcher unpacks its shape as ``(t, kh, d)`` and
            requires the same ``t`` and ``d`` as ``x`` and ``h % kh == 0``.
        cos: Cached cosine table, forwarded unchanged.
        sin: Cached sine table, forwarded unchanged.
        positions: Position tensor, forwarded unchanged.
        rotate_style: Compared against ``RotateStyle.NEOX`` by the launcher.
        reuse_freqs_front_part: Forwarded unchanged.
        nope_first: Forwarded unchanged.
        transpose_output: Forwarded unchanged.

    Returns:
        Tuple ``(out_x, out_y)`` of freshly allocated tensors.
    """
    out_x = torch.empty(x.shape, dtype=x.dtype, device=x.device, requires_grad=False)
    # Allocate the second output from y's own dtype/device: it receives the
    # rotated y, so borrowing x's attributes silently changes the result dtype
    # (or device) whenever the two inputs differ.
    out_y = torch.empty(y.shape, dtype=y.dtype, device=y.device, requires_grad=False)

    _rope_cached_thd_2c_fwd(
        x,
        y,
        out_x,
        out_y,
        cos,
        sin,
        positions,
        None,  # offsets
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        False,  # inplace
        transpose_output,
    )

    return out_x, out_y
def rope_cached_thd_positions_offsets_2c_fwd(
    x: torch.Tensor,
    y: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    positions: torch.Tensor,
    offsets: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
):
    """Apply cached-cos/sin RoPE with positions and offsets to two THD tensors.

    Allocates one output per input and forwards everything to
    ``_rope_cached_thd_2c_fwd`` with ``inplace=False``.

    Args:
        x: First input; the launcher unpacks its shape as ``(t, h, d)``.
        y: Second input; the launcher unpacks its shape as ``(t, kh, d)`` and
            requires the same ``t`` and ``d`` as ``x`` and ``h % kh == 0``.
        cos: Cached cosine table, forwarded unchanged.
        sin: Cached sine table, forwarded unchanged.
        positions: Position tensor, forwarded unchanged.
        offsets: Offset tensor, forwarded unchanged.
        rotate_style: Compared against ``RotateStyle.NEOX`` by the launcher.
        reuse_freqs_front_part: Forwarded unchanged.
        nope_first: Forwarded unchanged.
        transpose_output: Forwarded unchanged.

    Returns:
        Tuple ``(out_x, out_y)`` of freshly allocated tensors.
    """
    out_x = torch.empty(x.shape, dtype=x.dtype, device=x.device, requires_grad=False)
    # Allocate the second output from y's own dtype/device: it receives the
    # rotated y, so borrowing x's attributes silently changes the result dtype
    # (or device) whenever the two inputs differ.
    out_y = torch.empty(y.shape, dtype=y.dtype, device=y.device, requires_grad=False)

    _rope_cached_thd_2c_fwd(
        x,
        y,
        out_x,
        out_y,
        cos,
        sin,
        positions,
        offsets,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        False,  # inplace
        transpose_output,
    )

    return out_x, out_y
assert ( + t == ty + ), f"The number of tokens should be the same for the two inputs, but got {t} and {ty}" + assert ( + d == dy + ), f"The head dimension should be the same for the two inputs, but got {d} and {dy}" + assert h % kh == 0, f"QH should be multiple of KH, but got QH={h} and KH={kh}" + + if cos.shape[-1] == d // 2: + if reuse_freqs_front_part: + have_nope = False + else: + have_nope = True + elif cos.shape[-1] == d // 4: + have_nope = True + else: + have_nope = False + + if have_nope: + BLOCK_D = d // 2 + BLOCK_D_HALF = d // 4 + else: + BLOCK_D = d + BLOCK_D_HALF = d // 2 + + if h == kh: + BLOCK_T = 32 + SPLIT_T = (triton.next_power_of_2(t) + BLOCK_T - 1) // BLOCK_T + + if t >= 8192: + MIN_NUM_WG = 4096 + elif t >= 1024: + MIN_NUM_WG = 1024 + else: + MIN_NUM_WG = 512 + + if SPLIT_T < MIN_NUM_WG: + SPLIT_H_SIZE = h + SPLIT_H = (triton.next_power_of_2(h) + SPLIT_H_SIZE - 1) // SPLIT_H_SIZE + while SPLIT_H * SPLIT_T < MIN_NUM_WG and SPLIT_H_SIZE > 1: + SPLIT_H_SIZE = SPLIT_H_SIZE // 2 + SPLIT_H = (triton.next_power_of_2(h) + SPLIT_H_SIZE - 1) // SPLIT_H_SIZE + else: + SPLIT_H_SIZE = h + + SPLIT_H = (triton.next_power_of_2(h) + SPLIT_H_SIZE - 1) // SPLIT_H_SIZE + grid = (SPLIT_H, SPLIT_T, 1) + num_warps = 4 + waves_per_eu = 0 + num_stages = 2 if SPLIT_H_SIZE > 1 else 1 + + _rope_kernel_thd_cached_2c_bwd[grid]( + x, + y, + cos, + sin, + positions, + offsets, + out_x, + out_y, + *x.stride(), + *y.stride(), + *cos.stride(), + *positions.stride(), + *out_x.stride(), + *out_y.stride(), + t, + HAVE_NOPE=have_nope, + NOPE_FIRST=nope_first, + INPLACE=inplace, + REUSE_FREQS_FRONT_PART=reuse_freqs_front_part, + IS_NEOX=(rotate_style == RotateStyle.NEOX), + HAVE_POS=(positions is not None), + HAVE_OFFS=(offsets is not None), + BLOCK_T=BLOCK_T, + SPLIT_H_SIZE=SPLIT_H_SIZE, + BLOCK_D=BLOCK_D, + BLOCK_D_HALF=BLOCK_D_HALF, + num_warps=num_warps, + waves_per_eu=waves_per_eu, + num_stages=num_stages, + ) + else: + # TODO check boundary + if rotate_style == RotateStyle.GPTJ 
and t >= 1024: + BLOCK_T = 32 + SPLIT_T = (triton.next_power_of_2(t) + BLOCK_T - 1) // BLOCK_T + QH_per_G = h // kh + grid = (kh, SPLIT_T, 1) + num_warps = 4 + waves_per_eu = 0 + num_stages = 2 if QH_per_G > 1 else 1 + + _rope_kernel_cached_thd_2c_gqa_bwd[grid]( + x, + y, + cos, + sin, + positions, + offsets, + out_x, + out_y, + *x.stride(), + *y.stride(), + *cos.stride(), + *positions.stride(), + *out_x.stride(), + *out_y.stride(), + t, + HAVE_NOPE=have_nope, + NOPE_FIRST=nope_first, + INPLACE=inplace, + REUSE_FREQS_FRONT_PART=reuse_freqs_front_part, + IS_NEOX=(rotate_style == RotateStyle.NEOX), + HAVE_POS=(positions is not None), + HAVE_OFFS=(offsets is not None), + BLOCK_T=BLOCK_T, + QH_per_G=QH_per_G, + BLOCK_D=BLOCK_D, + BLOCK_D_HALF=BLOCK_D_HALF, + num_warps=num_warps, + waves_per_eu=waves_per_eu, + num_stages=num_stages, + ) + else: + BLOCK_T = min(max(triton.next_power_of_2(t), 16), 32) + SPLIT_T = (triton.next_power_of_2(t) + BLOCK_T - 1) // BLOCK_T + grid = (SPLIT_T, h, 1) + num_warps = 4 + waves_per_eu = 0 + _rope_kernel_cached_thd_2c_gqa_onehead_bwd[grid]( + x, + y, + cos, + sin, + positions, + offsets, + out_x, + out_y, + *x.stride(), + *y.stride(), + *cos.stride(), + *positions.stride(), + *out_x.stride(), + *out_y.stride(), + t, + HAVE_NOPE=have_nope, + NOPE_FIRST=nope_first, + INPLACE=inplace, + REUSE_FREQS_FRONT_PART=reuse_freqs_front_part, + IS_NEOX=(rotate_style == RotateStyle.NEOX), + HAVE_POS=(positions is not None), + HAVE_OFFS=(offsets is not None), + BLOCK_T=BLOCK_T, + G=kh, + BLOCK_D=BLOCK_D, + BLOCK_D_HALF=BLOCK_D_HALF, + num_warps=num_warps, + waves_per_eu=waves_per_eu, + ) + + return out_x, out_y + + +def rope_cached_thd_positions_2c_bwd( + x: torch.Tensor, + y: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + rotate_style: int, + reuse_freqs_front_part: bool, + nope_first: bool, + transpose_output: bool = False, +): + out_x = torch.empty(*x.shape, dtype=x.dtype, device=x.device, requires_grad=False) + 
def rope_cached_thd_positions_offsets_2c_bwd(
    x: torch.Tensor,
    y: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    positions: torch.Tensor,
    offsets: torch.Tensor,
    rotate_style: int,
    reuse_freqs_front_part: bool,
    nope_first: bool,
    transpose_output: bool = False,
):
    """Run the two-tensor THD cached RoPE backward launcher into new outputs.

    Allocates one output per input and forwards everything to
    ``_rope_cached_thd_positions_offsets_2c_bwd`` with ``inplace=False``.

    Args:
        x: First input; the launcher unpacks its shape as ``(t, h, d)``.
        y: Second input; the launcher unpacks its shape as ``(t, kh, d)`` and
            requires the same ``t`` and ``d`` as ``x`` and ``h % kh == 0``.
        cos: Cached cosine table, forwarded unchanged.
        sin: Cached sine table, forwarded unchanged.
        positions: Position tensor, forwarded unchanged.
        offsets: Offset tensor, forwarded unchanged.
        rotate_style: Compared against ``RotateStyle.NEOX`` by the launcher.
        reuse_freqs_front_part: Forwarded unchanged.
        nope_first: Forwarded unchanged.
        transpose_output: Forwarded unchanged.

    Returns:
        Tuple ``(out_x, out_y)`` of freshly allocated tensors.
    """
    out_x = torch.empty(x.shape, dtype=x.dtype, device=x.device, requires_grad=False)
    # Allocate the second output from y's own dtype/device: it receives the
    # result computed from y, so borrowing x's attributes silently changes the
    # result dtype (or device) whenever the two inputs differ.
    out_y = torch.empty(y.shape, dtype=y.dtype, device=y.device, requires_grad=False)

    _rope_cached_thd_positions_offsets_2c_bwd(
        x,
        y,
        out_x,
        out_y,
        cos,
        sin,
        positions,
        offsets,
        rotate_style,
        reuse_freqs_front_part,
        nope_first,
        False,  # inplace
        transpose_output,
    )

    return out_x, out_y
class RoPE(autograd.Function):
    """Autograd wrapper pairing ``rope_fwd`` with ``rope_bwd``."""

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        freqs: torch.Tensor,
        rotate_style: int,
        reuse_freqs_front_part: bool,
        nope_first: bool,
        transpose_output: bool = False,
    ) -> torch.Tensor:
        """Stash the rotation settings on ``ctx`` and return ``rope_fwd(...)``."""
        ctx.rotate_style = rotate_style
        ctx.reuse_freqs_front_part = reuse_freqs_front_part
        ctx.nope_first = nope_first
        ctx.transpose_output = transpose_output
        ctx.save_for_backward(freqs)
        return rope_fwd(
            x, freqs, rotate_style, reuse_freqs_front_part, nope_first, transpose_output
        )

    @staticmethod
    def backward(
        ctx, output_grads: torch.Tensor
    ) -> Tuple[Union[torch.Tensor, None], ...]:
        """Return the gradient for ``x`` and ``None`` for every other input."""
        (freqs,) = ctx.saved_tensors
        grad_x = rope_bwd(
            output_grads,
            freqs,
            ctx.rotate_style,
            ctx.reuse_freqs_front_part,
            ctx.nope_first,
            ctx.transpose_output,
        )
        # autograd requires exactly one entry per forward() argument after ctx:
        # x, freqs, rotate_style, reuse_freqs_front_part, nope_first,
        # transpose_output. Returning fewer raises
        # "returned an incorrect number of gradients".
        return grad_x, None, None, None, None, None


class RoPETHD(autograd.Function):
    """Autograd wrapper pairing ``rope_thd_fwd`` with ``rope_thd_bwd``."""

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        cu_seqlens: torch.Tensor,
        freqs: torch.Tensor,
        rotate_style: int,
        reuse_freqs_front_part: bool,
        nope_first: bool,
    ):
        """Stash the rotation settings on ``ctx`` and return ``rope_thd_fwd(...)``."""
        ctx.rotate_style = rotate_style
        ctx.reuse_freqs_front_part = reuse_freqs_front_part
        ctx.nope_first = nope_first
        ctx.save_for_backward(cu_seqlens, freqs)
        return rope_thd_fwd(
            x, cu_seqlens, freqs, rotate_style, reuse_freqs_front_part, nope_first
        )

    @staticmethod
    def backward(ctx, output_grads) -> Tuple[Union[torch.Tensor, None], ...]:
        """Return the gradient for ``x`` and ``None`` for every other input."""
        cu_seqlens, freqs = ctx.saved_tensors
        grad_x = rope_thd_bwd(
            output_grads,
            cu_seqlens,
            freqs,
            ctx.rotate_style,
            ctx.reuse_freqs_front_part,
            ctx.nope_first,
        )
        # One entry per forward() argument after ctx: x, cu_seqlens, freqs,
        # rotate_style, reuse_freqs_front_part, nope_first.
        return grad_x, None, None, None, None, None


class RoPECached(autograd.Function):
    """Autograd wrapper pairing ``rope_cached_fwd`` with ``rope_cached_bwd``."""

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        rotate_style: int,
        reuse_freqs_front_part: bool,
        nope_first: bool,
        transpose_output: bool = False,
    ) -> torch.Tensor:
        """Stash the rotation settings on ``ctx`` and return ``rope_cached_fwd(...)``."""
        ctx.rotate_style = rotate_style
        ctx.reuse_freqs_front_part = reuse_freqs_front_part
        ctx.nope_first = nope_first
        ctx.transpose_output = transpose_output
        ctx.save_for_backward(cos, sin)
        return rope_cached_fwd(
            x,
            cos,
            sin,
            rotate_style,
            reuse_freqs_front_part,
            nope_first,
            transpose_output,
        )

    @staticmethod
    def backward(ctx, output_grads) -> Tuple[Union[torch.Tensor, None], ...]:
        """Return the gradient for ``x`` and ``None`` for every other input."""
        cos, sin = ctx.saved_tensors
        grad_x = rope_cached_bwd(
            output_grads,
            cos,
            sin,
            ctx.rotate_style,
            ctx.reuse_freqs_front_part,
            ctx.nope_first,
            ctx.transpose_output,
        )
        # One entry per forward() argument after ctx: x, cos, sin, rotate_style,
        # reuse_freqs_front_part, nope_first, transpose_output.
        return grad_x, None, None, None, None, None, None
# Tuned launch parameters for the sigmoid top-1 routing kernel, keyed by the
# number of experts N and then by M bucket. Each entry is
# (BLOCK_M, BLOCK_K, num_warps, num_stages, waves_per_eu, kpack).
_ROUTING_TOP1_CONFIGS = {
    16: {
        "small": (16, 256, 4, 2, 3, 1),
        "medium": (16, 256, 4, 2, 3, 1),
        "large": (16, 256, 4, 2, 3, 2),
        "very_large": (32, 256, 4, 2, 0, 1),
    },
    128: {
        "small": (16, 256, 8, 1, 0, 1),
        "medium": (16, 256, 8, 1, 0, 2),
        "large": (16, 256, 8, 1, 2, 2),
        "very_large": (32, 128, 8, 2, 2, 2),
    },
}


def get_config_heuristic(M, K, N):
    """
    Return the tuned Triton configuration for the given problem size.

    Args:
        M: Batch dimension; selects the bucket (small: <2048,
            medium: 2048-4095, large: 4096-8191, very_large: 8192+).
        K: Hidden dimension. Accepted for signature symmetry; the lookup does
            not depend on it.
        N: Number of experts. Only the values present in the tuning table
            (16 and 128) are supported.

    Returns:
        triton.Config: Configuration for the Triton kernel

    Raises:
        ValueError: If no tuned configuration exists for ``N``.
    """
    # Validate first so an unsupported expert count yields an actionable
    # message instead of a bare KeyError from the table lookup.
    if N not in _ROUTING_TOP1_CONFIGS:
        raise ValueError(
            f"No tuned routing config for N={N}; "
            f"supported expert counts are {sorted(_ROUTING_TOP1_CONFIGS)}"
        )

    if M >= 8192:
        m_bucket = "very_large"
    elif M >= 4096:
        m_bucket = "large"
    elif M >= 2048:
        m_bucket = "medium"
    else:
        m_bucket = "small"

    BLOCK_M, BLOCK_K, num_warps, num_stages, waves_per_eu, kpack = (
        _ROUTING_TOP1_CONFIGS[N][m_bucket]
    )

    return triton.Config(
        {
            "BLOCK_M": BLOCK_M,
            "BLOCK_K": BLOCK_K,
            "matrix_instr_nonkdim": 16,  # Always 16
            "waves_per_eu": waves_per_eu,
            "kpack": kpack,
        },
        num_warps=num_warps,
        num_stages=num_stages,
        num_ctas=1,
    )
@triton.jit
def _routing_sigmoid_top1_kernel(
    X_ptr,
    W_ptr,
    topk_ids_ptr,
    topk_weights_ptr,
    M,
    N,
    K,
    stride_xm,
    stride_xk,
    stride_wk,
    stride_wn,
    stride_topk_ids_m,
    stride_topk_ids_n,
    stride_topk_weights_m,
    stride_topk_weights_n,
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_N: tl.constexpr,
    TOPK: tl.constexpr,
    FUSED_SHARED_EXPERTS: tl.constexpr,
):
    """Compute sigmoid(X @ W) per row and emit the top-1 expert id and weight.

    Each program handles BLOCK_M rows of X. With FUSED_SHARED_EXPERTS an extra
    output column is written holding id ``N`` and weight ``1.0``.
    """
    # Program ID corresponds to the block index in M dimension
    pid_m = tl.program_id(axis=0)

    # Offsets for the current block
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)

    _TOPK: tl.constexpr = TOPK + 1 if FUSED_SHARED_EXPERTS else TOPK

    offs_topk = tl.arange(0, _TOPK)

    # Masks for bounds checking
    mask_m = offs_m < M
    mask_n = offs_n < N

    # Initialize accumulator for matmul (will be in float32 due to default acc_type)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

    # Loop over K dimension in chunks of BLOCK_K
    for k in range(0, K, BLOCK_K):
        # Compute pointers for A and B
        offs_k_iter = k + offs_k
        mask_k = offs_k_iter < K

        X_ptrs = X_ptr + (
            # pyre-ignore
            offs_m[:, None] * stride_xm
            + offs_k_iter[None, :] * stride_xk
        )
        W_ptrs = W_ptr + (
            offs_k_iter[:, None] * stride_wk + offs_n[None, :] * stride_wn
        )

        # Load A and B tiles
        # pyre-ignore
        x = tl.load(X_ptrs, mask=(mask_m[:, None] & mask_k[None, :]), other=0.0)
        w = tl.load(W_ptrs, mask=(mask_k[:, None] & mask_n[None, :]), other=0.0)

        # Compute partial matmul for the current block with FP32 accumulation
        acc = tl.dot(x, w, acc=acc)

    # NOTE(review): columns with offs_n >= N accumulate 0 and become 0.5 after
    # the sigmoid, so they could win the argmax if BLOCK_N > N. The visible
    # launcher passes BLOCK_N=N, which avoids this — confirm before launching
    # with a padded BLOCK_N.
    acc = tl.sigmoid(acc)
    # Get topk results
    topk_ids = tl.argmax(acc, axis=1, tie_break_left=True)  # Shape: (BLOCK_M,)
    topk_weights = tl.max(acc, axis=1)  # Shape: (BLOCK_M,)

    # Create buffers for results
    topk_ids_buffer = tl.zeros((BLOCK_M, _TOPK), dtype=tl.int32)
    topk_weights_buffer = tl.zeros((BLOCK_M, _TOPK), dtype=tl.float32)

    if FUSED_SHARED_EXPERTS:
        # Leading column(s) take the routed expert; the last column is the
        # shared expert (id N, weight 1.0).
        topk_ids_buffer = tl.where(
            (offs_topk[None, :] < _TOPK - 1), topk_ids[:, None], N
        )
        topk_weights_buffer = tl.where(
            (offs_topk[None, :] < _TOPK - 1), topk_weights[:, None], 1.0
        )
    else:
        topk_ids_buffer = topk_ids[:, None]
        topk_weights_buffer = topk_weights[:, None]

    topk_ids_ptrs = (
        topk_ids_ptr
        + offs_m[:, None] * stride_topk_ids_m
        + offs_topk[None, :] * stride_topk_ids_n
    )

    topk_weights_ptrs = (
        topk_weights_ptr
        + offs_m[:, None] * stride_topk_weights_m
        + offs_topk[None, :] * stride_topk_weights_n
    )

    # Rows at or beyond M belong to the padding of the last block. The loads
    # above are masked, and the stores must be too, otherwise they write past
    # the end of the (M, _TOPK) outputs whenever M is not a multiple of BLOCK_M.
    store_mask = mask_m[:, None] & (offs_topk[None, :] < _TOPK)

    tl.store(topk_ids_ptrs, topk_ids_buffer, mask=store_mask)
    tl.store(topk_weights_ptrs, topk_weights_buffer, mask=store_mask)
def torch_routing_sigmoid_top1(
    x, w, topk, fused_shared_experts=False, dummy_ids=None, dummy_weights=None
):
    """Pure-PyTorch reference for sigmoid top-1 expert routing.

    Computes ``sigmoid(x @ w)`` in float32 and picks the best expert per token.

    Args:
        x: Activations; every leading dimension is flattened into a token
            dimension, the same way ``routing_sigmoid_top1`` flattens its input.
        w: Router weights of shape ``[K, N]`` (``N`` = number of experts).
        topk: Must be 1.
        fused_shared_experts: When true, one extra column is appended to both
            outputs for the shared expert.
        dummy_ids: Optional ``[M, 1]`` int32 ids for the extra column. Defaults
            to ``N``, the value the Triton kernel writes in that column.
        dummy_weights: Optional ``[M, 1]`` float32 weights for the extra
            column. Defaults to ``1.0``, matching the Triton kernel.

    Returns:
        ``(topk_ids, topk_weights)``: int32 and float32 tensors of shape
        ``[M, 1]``, or ``[M, 2]`` when ``fused_shared_experts`` is set.
    """
    assert topk == 1

    x = x.reshape(-1, x.shape[-1])
    num_experts = w.shape[-1]

    scores = torch.sigmoid(torch.matmul(x, w).to(torch.float32))  # [M, N]

    # torch.max(dim=...) returns the first maximal index on ties, which lines
    # up with the kernel's tl.argmax(..., tie_break_left=True); torch.topk
    # makes no such promise and sigmoid saturates easily in float32.
    topk_weights, topk_ids = torch.max(scores, dim=1, keepdim=True)  # [M, 1]
    topk_ids = topk_ids.to(torch.int32)
    topk_weights = topk_weights.to(torch.float32)

    if fused_shared_experts:
        if dummy_ids is None:
            dummy_ids = torch.full_like(topk_ids, num_experts)
        if dummy_weights is None:
            dummy_weights = torch.ones_like(topk_weights)
        topk_ids = torch.cat([topk_ids, dummy_ids], dim=1)
        topk_weights = torch.cat([topk_weights, dummy_weights], dim=1)

    return topk_ids, topk_weights
km=km, BLKQ=BLKQ, BLKK=BLKK, sm_scale=sm_scale, tensor_layout=tensor_layout) + else: + raise ValueError(f"Unsupported quantization backend: {quantization_backend}") + return q_int8, q_scale, k_int8, k_scale + + +def _get_attn_false_config_key(q, k, tensor_layout): + if tensor_layout == "HND": + b, h_qo, qo_len, head_dim = q.shape + _, h_kv, kv_len, _ = k.shape + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + else: + raise ValueError(f"tensor_layout {tensor_layout} not supported") + + num_kv_groups = h_qo // h_kv + return str((qo_len, kv_len, h_qo, num_kv_groups)) + + +def sageattn_qk_int8_pv_fp16( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + tensor_layout: str = "HND", + quantization_backend: str = "triton", + is_causal: bool =False, + attn_mask: Optional[torch.Tensor] = None, + sm_scale: Optional[float] = None, + smooth_k: bool = True, + return_lse: bool = False, + **kwargs: Any, +) -> torch.Tensor: + """ + SageAttention with per-block INT8 quantization for Q and K, FP16 PV with FP16 accumulation, implemented using Triton. + The FP16 accumulator is added to a FP32 buffer immediately after each iteration. + + Parameters + ---------- + q : torch.Tensor + The query tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + k : torch.Tensor + The key tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + v : torch.Tensor + The value tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``. + + tensor_layout : str + The tensor layout, either "HND" or "NHD". + Default: "HND". 
+ + quantization_backend : str + The quantization backend, either "triton" or "cuda". + "cuda" backend offers better performance due to kernel fusion. + + is_causal : bool + Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len. + Default: False. + + attn_mask : Optional[torch.Tensor] + The attention mask tensor, of dtype bool or float32. + Should be able to broadcast to the shape of the matrix qk^T. + Default: None. + + sm_scale : Optional[float] + The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``. + + smooth_k : bool + Whether to smooth the key tensor by subtracting the mean along the sequence dimension. + Default: True. + + return_lse : bool + Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention. + Default: False. + + Returns + ------- + torch.Tensor + The output tensor. Shape: + - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``. + - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``. + + torch.Tensor + The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor). + Shape: ``[batch_size, num_qo_heads, qo_len]``. + Only returned if `return_lse` is True. + + Note + ---- + - ``num_qo_heads`` must be divisible by ``num_kv_heads``. + - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16``, ``torch.bfloat16`` or ``torch.float32``. + - All tensors must be on the same cuda device. + - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances. + """ + + dtype = q.dtype + assert q.is_cuda, "Input tensors must be on cuda." + assert dtype in [torch.float16, torch.bfloat16], "Input tensors must be in dtype of torch.float16 or torch.bfloat16" + assert q.device == k.device == v.device, "All tensors must be on the same device." + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype." 
+ + if attn_mask is not None: + assert attn_mask.dtype == torch.bool or attn_mask.dtype == q.dtype, "attn_mask must be of dtype bool or the same dtype as q." + assert attn_mask.device == q.device, "All tensors must be on the same device." + + # FIXME(DefTruth): make sage attention work compatible with distributed + # env, for example, xDiT which launch by torchrun. Without this workaround, + # sage attention will run into illegal memory access error after first + # inference step in distributed env for multi gpus inference. This small + # workaround also make sage attention work compatible with torch.compile + # through non-fullgraph compile mode. + torch.cuda.set_device(v.device) + + head_dim_og = q.size(-1) + + if head_dim_og < 64: + q = torch.nn.functional.pad(q, (0, 64 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 64 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 64 - head_dim_og)) + elif head_dim_og > 64 and head_dim_og < 128: + q = torch.nn.functional.pad(q, (0, 128 - head_dim_og)) + k = torch.nn.functional.pad(k, (0, 128 - head_dim_og)) + v = torch.nn.functional.pad(v, (0, 128 - head_dim_og)) + elif head_dim_og > 128: + raise ValueError(f"Unsupported head_dim: {head_dim_og}") + + # assert last dim is contiguous + assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, "Last dim of qkv must be contiguous." 
+ + seq_dim = 1 if tensor_layout == "NHD" else 2 + nh_dim = 2 if tensor_layout == 0 else 1 + + if smooth_k: + km = k.mean(dim=seq_dim, keepdim=True) + nqheads = q.size(2) + nkheads = k.size(2) + q_per_kv_heads = nqheads // nkheads + if q_per_kv_heads > 1: + # nheads_k => nheads_q + km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim) + else: + km_broadcast = km + if return_lse: + if tensor_layout == "NHD": + lse_correction = torch.matmul(q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)).squeeze(-1).to(torch.float32) + else: + lse_correction = torch.matmul(q, km_broadcast.transpose(2, 3)).squeeze(-1).to(torch.float32) + else: + km = None + + if dtype == torch.bfloat16 or dtype == torch.float32: + v = v.to(torch.float16) + + if sm_scale is None: + sm_scale = 1.0 / (head_dim_og ** 0.5) + + if is_causal: + assert attn_mask is None, "Mask should be None for causal attention." + o, lse = attn_true(q_int8, k_int8, v, q_scale, k_scale, tensor_layout=tensor_layout, output_dtype=dtype, return_lse=return_lse) + else: + if attn_mask is not None: + if tensor_layout == "HND": + target_shape = (q.shape[0], q.shape[1], q.shape[2], k.shape[2]) + elif tensor_layout == "NHD": + target_shape = (q.shape[0], q.shape[2], q.shape[1], k.shape[1]) + else: + raise ValueError(f"tensor_layout {tensor_layout} not supported") + try: + attn_mask = attn_mask.expand(target_shape) + except Exception: + raise AssertionError(f"attn_mask shape {attn_mask.shape} cannot be broadcast to {target_shape}") + + key = _get_attn_false_config_key(q, k, tensor_layout) + config = _get_attn_false_config(key, q.shape[-1]) + + q_int8, q_scale, k_int8, k_scale = do_quant_qk(q, k, BLKQ=config['BLOCK_M'], BLKK=config['BLOCK_N'], km=km, sm_scale=sm_scale, tensor_layout=tensor_layout, quantization_backend=quantization_backend) + + o, lse = attn_false(q_int8, k_int8, v, q_scale, k_scale, tensor_layout=tensor_layout, output_dtype=dtype, attn_mask=attn_mask, return_lse=return_lse, 
config=config) + + o = o[..., :head_dim_og] + + if return_lse: + return o, lse / 1.44269504 + lse_correction * sm_scale if smooth_k else lse / 1.44269504 + else: + return o + diff --git a/aiter/ops/triton/sage_attention_qk_int8_per_block.py b/aiter/ops/triton/sage_attention_qk_int8_per_block.py new file mode 100644 index 0000000000000000000000000000000000000000..82db5ef410fb51eae54fecfbe01e8a2369ebb664 --- /dev/null +++ b/aiter/ops/triton/sage_attention_qk_int8_per_block.py @@ -0,0 +1,242 @@ +""" +Copyright (c) 2024 by SageAttention team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
@triton.jit
def _attn_fwd(Q, K, V, Q_scale, K_scale, Out, mask, Lse,
              stride_qz, stride_qh, stride_qn,
              stride_kz, stride_kh, stride_kn,
              stride_vz, stride_vh, stride_vn,
              stride_oz, stride_oh, stride_on,
              stride_maskz, stride_maskh, stride_maskm, stride_maskn,
              qo_len, kv_len, H: tl.constexpr, num_kv_groups: tl.constexpr,
              HEAD_DIM: tl.constexpr,
              BLOCK_M: tl.constexpr,
              BLOCK_N: tl.constexpr,
              STAGE: tl.constexpr,
              RETURN_LSE: tl.constexpr,
              ):
    """Non-causal attention forward for one (query tile, head, batch) program.

    Grid axis 0 selects a tile of ``BLOCK_M`` query rows, axis 1 the query
    head, axis 2 the batch entry. ``Q_scale`` / ``K_scale`` hold one scale per
    ``BLOCK_M`` query rows / ``BLOCK_N`` key rows. The head dimension of
    Q/K/V/Out is addressed with unit stride. ``mask`` may be ``None``.
    When ``RETURN_LSE`` is set, a base-2 log-sum-exp is written to ``Lse``,
    indexed as a contiguous ``[batch, H, qo_len]`` buffer.
    """
    tl.assume(stride_qz > 0)
    tl.assume(stride_qh > 0)
    tl.assume(stride_qn > 0)
    tl.assume(stride_kz > 0)
    tl.assume(stride_kh > 0)
    tl.assume(stride_kn > 0)
    tl.assume(stride_vz > 0)
    tl.assume(stride_vh > 0)
    tl.assume(stride_vn > 0)
    tl.assume(stride_oz > 0)
    tl.assume(stride_oh > 0)
    tl.assume(stride_on > 0)
    # Mask strides are legitimately 0: the launcher passes 0 for all four when
    # there is no mask, and a mask broadcast with `expand` has stride 0 on the
    # broadcast axes. Assuming "> 0" would hand the compiler a false fact.
    tl.assume(stride_maskz >= 0)
    tl.assume(stride_maskh >= 0)
    tl.assume(stride_maskm >= 0)
    tl.assume(stride_maskn >= 0)
    tl.assume(qo_len > 0)
    tl.assume(kv_len > 0)
    tl.assume(num_kv_groups > 0)

    start_m = tl.program_id(0)

    off_z = tl.program_id(2).to(tl.int64)
    off_h = tl.program_id(1).to(tl.int64)

    # One scale per tile, stored per (batch, head); K uses the kv head that
    # this query head maps to (H // num_kv_groups kv heads in total).
    q_scale_offset = (off_z * H + off_h) * tl.cdiv(qo_len, BLOCK_M)
    k_scale_offset = (off_z * (H // num_kv_groups) + off_h // num_kv_groups) * tl.cdiv(kv_len, BLOCK_N)

    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, HEAD_DIM)
    Q_ptrs = Q + (off_z * stride_qz + off_h * stride_qh) + offs_m[:, None] * stride_qn + offs_k[None, :]
    Q_scale_ptr = Q_scale + q_scale_offset + start_m
    # K is addressed transposed ([HEAD_DIM, BLOCK_N]) so it feeds tl.dot(q, k) directly.
    K_ptrs = K + (off_z * stride_kz + (off_h // num_kv_groups) * stride_kh) + offs_n[None, :] * stride_kn + offs_k[:, None]
    K_scale_ptr = K_scale + k_scale_offset
    V_ptrs = V + (off_z * stride_vz + (off_h // num_kv_groups) * stride_vh) + offs_n[:, None] * stride_vn + offs_k[None, :]
    O_block_ptr = Out + (off_z * stride_oz + off_h * stride_oh) + offs_m[:, None] * stride_on + offs_k[None, :]
    if mask is None:
        mask_ptrs = None
    else:
        mask_ptrs = mask + (off_z * stride_maskz + off_h * stride_maskh) + offs_m[:, None] * stride_maskm + offs_n[None, :] * stride_maskn

    # Online-softmax state: running row max, running denominator, accumulator.
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
    acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)

    q = tl.load(Q_ptrs, mask = offs_m[:, None] < qo_len)
    q_scale = tl.load(Q_scale_ptr)
    acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, q_scale, qo_len, kv_len, K_ptrs, K_scale_ptr, V_ptrs, stride_kn, stride_vn,
                                    start_m, mask_ptrs, stride_maskn,
                                    BLOCK_M, HEAD_DIM, BLOCK_N,
                                    4 - STAGE, offs_m, offs_n
                                    )
    acc = acc / l_i[:, None]
    tl.store(O_block_ptr, acc.to(Out.type.element_ty), mask = (offs_m[:, None] < qo_len))

    if RETURN_LSE:
        lse_ptrs = Lse + (off_z * qo_len * H + off_h * qo_len) + offs_m
        l_i = tl.log2(l_i) + m_i
        tl.store(lse_ptrs, l_i, mask = (offs_m < qo_len))
kv_len, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2) + stride_bz_v, stride_h_v, stride_seq_v = v.stride(0), v.stride(1), v.stride(2) + stride_bz_o, stride_h_o, stride_seq_o = o.stride(0), o.stride(1), o.stride(2) + elif tensor_layout == "NHD": + b, qo_len, h_qo, head_dim = q.shape + _, kv_len, h_kv, _ = k.shape + + stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1) + stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1) + stride_bz_v, stride_h_v, stride_seq_v = v.stride(0), v.stride(2), v.stride(1) + stride_bz_o, stride_h_o, stride_seq_o = o.stride(0), o.stride(2), o.stride(1) + else: + raise ValueError(f"tensor_layout {tensor_layout} not supported") + + if attn_mask is not None: + stride_bz_mask, stride_h_mask, stride_m_mask, stride_n_mask = attn_mask.stride(0), attn_mask.stride(1), attn_mask.stride(2), attn_mask.stride(3) + else: + stride_bz_mask, stride_h_mask, stride_m_mask, stride_n_mask = 0, 0, 0, 0 + + HEAD_DIM_K = head_dim + num_kv_groups = h_qo // h_kv + + if return_lse: + lse = torch.empty([b, h_qo, qo_len], dtype=torch.float32, device=q.device) + else: + lse = torch.empty([0], dtype=torch.float32, device='cpu') + + grid = lambda META: (triton.cdiv(qo_len, META['BLOCK_M']), h_qo, b) + + if not config: + key = str((qo_len, kv_len, h_qo, num_kv_groups)) + config = _get_config(key, head_dim) + assert config is not None, "ERROR: optimal config not found" + + _attn_fwd[grid]( + q, k, v, q_scale, k_scale, o, attn_mask, lse, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_v, stride_h_v, stride_seq_v, + stride_bz_o, stride_h_o, stride_seq_o, + stride_bz_mask, stride_h_mask, stride_m_mask, stride_n_mask, + qo_len, kv_len, + h_qo, num_kv_groups, + HEAD_DIM=HEAD_DIM_K, + RETURN_LSE=return_lse, + **config + ) + + return o, lse diff --git 
@triton.jit
def _attn_fwd_inner(acc, l_i, m_i, q, q_scale, kv_len,
                    K_ptrs, K_scale_ptr, V_ptrs, stride_kn, stride_vn,
                    start_m,
                    BLOCK_M: tl.constexpr, HEAD_DIM: tl.constexpr, BLOCK_N: tl.constexpr,
                    STAGE: tl.constexpr, offs_m: tl.constexpr, offs_n: tl.constexpr,
                    ):
    """Run one stage of the causal online-softmax loop over key/value tiles.

    ``STAGE == 1`` walks the key range ``[0, start_m * BLOCK_M)``.
    ``STAGE == 2`` walks ``[start_m * BLOCK_M, (start_m + 1) * BLOCK_M)`` and
    additionally keeps only keys whose index is <= the query row index.
    Returns the updated ``(acc, l_i, m_i)``.

    NOTE(review): no other STAGE value assigns ``lo``/``hi``; callers are
    presumably expected to pass only 1 or 2 - confirm against the launch config.
    """
    if STAGE == 1:
        lo, hi = 0, start_m * BLOCK_M
    elif STAGE == 2:
        lo, hi = start_m * BLOCK_M, (start_m + 1) * BLOCK_M
        lo = tl.multiple_of(lo, BLOCK_M)
        # Skip the K/V rows (and the per-BLOCK_N K scales) that precede `lo`.
        K_scale_ptr += lo // BLOCK_N
        K_ptrs += stride_kn * lo
        V_ptrs += stride_vn * lo
    for start_n in range(lo, hi, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # Columns at or beyond kv_len are out of range for this tile.
        k_mask = offs_n[None, :] < (kv_len - start_n)
        k = tl.load(K_ptrs, mask = k_mask)
        k_scale = tl.load(K_scale_ptr)
        # Rescale the q.k product with the per-tile scales.
        qk = tl.dot(q, k).to(tl.float32) * (q_scale * k_scale)

        mask = k_mask
        if STAGE == 2:
            # Causal constraint: a query row only sees keys up to its own index.
            mask &= offs_m[:, None] >= (start_n + offs_n[None, :])
        qk += tl.where(mask, 0, float('-inf'))
        m_ij = tl.maximum(m_i, tl.max(qk, 1))
        qk -= m_ij[:, None]

        # Base-2 exponentials throughout.
        p = tl.math.exp2(qk)
        l_ij = tl.sum(p, 1)

        # Rescale the running denominator and accumulator to the new row max.
        alpha = tl.math.exp2(m_i - m_ij)
        l_i = l_i * alpha + l_ij

        acc = acc * alpha[:, None]

        v = tl.load(V_ptrs, mask = offs_n[:, None] < (kv_len - start_n))
        p = p.to(tl.float16)

        # FP16 P.V product, added into the caller-provided accumulator.
        acc += tl.dot(p, v, out_dtype=tl.float16)
        m_i = m_ij
        K_ptrs += BLOCK_N * stride_kn
        K_scale_ptr += 1
        V_ptrs += BLOCK_N * stride_vn
    return acc, l_i, m_i
@functools.lru_cache(maxsize=1024)
def _get_config(key, head_dim):
    """Return the launch config for ``_attn_causal_fwd`` for a problem key.

    The tuned table is loaded from JSON once and kept on the function object.
    A missing or unreadable table is treated as empty (best effort), in which
    case every key gets the default config.

    Args:
        key: ``str((qo_len, kv_len, h_qo, num_kv_groups))``.
        head_dim: Head dimension; only used to pick ``num_warps`` for the default.

    Returns:
        A dict of kernel meta-parameters. The dict is shared through the
        cache, so callers must not mutate it.
    """
    # This module never imports json at the top, so without this the load
    # below failed with a NameError that the broad except turned into
    # "no tuned configs".
    import json

    if not hasattr(_get_config, "_config_dict"):
        try:
            # NOTE(review): the sibling modules read their tables from a
            # "sage_attention/" sub-directory; confirm where this file is shipped.
            config_path = f"{AITER_TRITON_CONFIGS_PATH}/_attn_causal_fwd-device={get_gpu_label()}-dtype=f16_f16_f16_f16_f32.json"
            print(f"config_path={config_path}")
            with open(config_path, "r") as file:
                config = json.load(file)
        except Exception as e:
            print(e)
            config = {'config': {}}
        _get_config._config_dict = config

    config = _get_config._config_dict["config"]
    if key in config:
        return config[key]

    default_config = {
        "BLOCK_M": 128,
        "BLOCK_N": 64,
        # _attn_causal_fwd calls the inner loop with `4 - STAGE` and then with
        # 2; the inner loop only implements stages 1 and 2, so STAGE must be 3.
        "STAGE": 3,
        "waves_per_eu": 1,
        "matrix_instr_nonkdim": 16,
        "kpack": 2,
        "num_warps": 4 if head_dim == 64 else 8,
        "num_stages": 2
    }
    print(f"WARNING: optimal config {key} not found for _attn_causal_fwd, use default config: {default_config}")
    return default_config


def forward(q, k, v, q_scale, k_scale, tensor_layout="HND", output_dtype=torch.float16, return_lse=False, config=None):
    """Launch the causal per-block INT8 attention kernel.

    Args:
        q, k: Quantized query/key tensors in ``tensor_layout``.
        v: Value tensor in ``tensor_layout``.
        q_scale, k_scale: Per-block scales matching the config's
            ``BLOCK_M`` / ``BLOCK_N``.
        tensor_layout: "HND" or "NHD".
        output_dtype: dtype of the returned output.
        return_lse: Whether to fill and return the base-2 log-sum-exp.
        config: Optional kernel meta-parameters; looked up via ``_get_config``
            when omitted.

    Returns:
        ``(o, lse)``. ``o`` has the shape of ``q``. ``lse`` is float32
        ``[b, h_qo, qo_len]``, or an empty placeholder when ``return_lse`` is false.

    Raises:
        ValueError: for an unknown ``tensor_layout``.
        AssertionError: when ``qo_len != kv_len``.
    """
    o = torch.empty(q.shape, dtype=output_dtype, device=q.device)

    if tensor_layout == "HND":
        b, h_qo, qo_len, head_dim = q.shape
        _, h_kv, kv_len, _ = k.shape

        stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2)
        stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2)
        stride_bz_v, stride_h_v, stride_seq_v = v.stride(0), v.stride(1), v.stride(2)
        stride_bz_o, stride_h_o, stride_seq_o = o.stride(0), o.stride(1), o.stride(2)
    elif tensor_layout == "NHD":
        b, qo_len, h_qo, head_dim = q.shape
        _, kv_len, h_kv, _ = k.shape

        stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1)
        stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1)
        stride_bz_v, stride_h_v, stride_seq_v = v.stride(0), v.stride(2), v.stride(1)
        stride_bz_o, stride_h_o, stride_seq_o = o.stride(0), o.stride(2), o.stride(1)
    else:
        raise ValueError(f"tensor_layout {tensor_layout} not supported")

    assert qo_len == kv_len, "qo_len and kv_len must be equal for causal attention"

    HEAD_DIM_K = head_dim
    num_kv_groups = h_qo // h_kv

    if return_lse:
        lse = torch.empty([b, h_qo, qo_len], dtype=torch.float32, device=q.device)
    else:
        # Placeholder; the kernel only touches Lse when RETURN_LSE is set.
        lse = torch.empty([0], dtype=torch.float32, device='cpu')

    grid = lambda META: (triton.cdiv(qo_len, META['BLOCK_M']), h_qo, b)

    if config is None:
        # Same key format as the non-causal path: one string per problem shape.
        key = str((qo_len, kv_len, h_qo, num_kv_groups))
        config = _get_config(key, head_dim)

    _attn_causal_fwd[grid](
        q, k, v, q_scale, k_scale, o, lse,
        stride_bz_q, stride_h_q, stride_seq_q,
        stride_bz_k, stride_h_k, stride_seq_k,
        stride_bz_v, stride_h_v, stride_seq_v,
        stride_bz_o, stride_h_o, stride_seq_o,
        qo_len, kv_len,
        h_qo, num_kv_groups,
        HEAD_DIM=HEAD_DIM_K,
        RETURN_LSE=return_lse,
        **config
    )

    return o, lse
@triton.jit
def quant_per_block_int8_kernel(Input, Output, Scale, L,
                                stride_iz, stride_ih, stride_in,
                                stride_oz, stride_oh, stride_on,
                                stride_sz, stride_sh,
                                sm_scale,
                                C: tl.constexpr, BLK: tl.constexpr):
    """Quantize one ``[BLK, C]`` tile to INT8 with a single per-tile scale.

    Grid axis 0 selects the tile along the sequence, axis 1 the head, axis 2
    the batch entry. The tile is multiplied by ``sm_scale``, its absolute
    maximum divided by 127 becomes the scale, and values are rounded half away
    from zero. Rows at or beyond ``L`` are neither used nor written. The last
    dimension of ``Input``/``Output`` is addressed with unit stride.
    """
    tl.assume(L > 0)
    tl.assume(stride_iz > 0)
    tl.assume(stride_ih > 0)
    tl.assume(stride_in > 0)
    tl.assume(stride_oz > 0)
    tl.assume(stride_oh > 0)
    tl.assume(stride_on > 0)
    tl.assume(stride_sz > 0)
    tl.assume(stride_sh > 0)

    off_blk = tl.program_id(0)
    off_h = tl.program_id(1)
    off_b = tl.program_id(2)

    offs_n = off_blk * BLK + tl.arange(0, BLK)
    offs_k = tl.arange(0, C)
    row_mask = offs_n[:, None] < L

    input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
    output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
    scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk

    # Explicit zero fill: padded rows take part in the tile-wide abs-max
    # below, so their value must not depend on the backend's default for
    # masked lanes.
    x = tl.load(input_ptrs, mask=row_mask, other=0.0)
    x = x.to(tl.float32)
    x *= sm_scale
    scale = tl.max(tl.abs(x)) / 127.
    # An all-zero tile has scale == 0; divide by 1 there so the INT8 cast sees
    # 0 instead of NaN. The stored scale is unchanged.
    x_int8 = x / tl.where(scale > 0, scale, 1.0)
    # Round half away from zero before the truncating cast.
    x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
    x_int8 = x_int8.to(tl.int8)
    tl.store(output_ptrs, x_int8, mask=row_mask)
    tl.store(scale_ptrs, scale)
k.stride(0), k.stride(2), k.stride(1) + stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1) + else: + raise ValueError(f"Unknown tensor layout: {tensor_layout}") + + if sm_scale is None: + sm_scale = head_dim**-0.5 + + grid = ((qo_len + BLKQ - 1) // BLKQ, h_qo, b) + + keys = str((qo_len, head_dim, BLKQ)) + config = _get_config(keys, BLKQ) + assert config is not None, "ERROR: optimal config not found" + + q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ), + device=q.device, dtype=torch.float32) + + quant_per_block_int8_kernel[grid]( + q, q_int8, q_scale, qo_len, + stride_bz_q, stride_h_q, stride_seq_q, + stride_bz_qo, stride_h_qo, stride_seq_qo, + q_scale.stride(0), q_scale.stride(1), + sm_scale=(sm_scale * 1.44269504), + C=head_dim, BLK=BLKQ, + **config + ) + + grid = ((kv_len + BLKK - 1) // BLKK, h_kv, b) + + keys = str((kv_len, head_dim, BLKK)) + config = _get_config(keys, BLKK) + assert config is not None, "ERROR: optimal config not found" + + k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK), + device=q.device, dtype=torch.float32) + + quant_per_block_int8_kernel[grid]( + k, k_int8, k_scale, kv_len, + stride_bz_k, stride_h_k, stride_seq_k, + stride_bz_ko, stride_h_ko, stride_seq_ko, + k_scale.stride(0), k_scale.stride(1), + sm_scale=1.0, + C=head_dim, BLK=BLKK, + **config + ) + + return q_int8, q_scale, k_int8, k_scale diff --git a/aiter/ops/triton/softmax.py b/aiter/ops/triton/softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..37719a5d109b6e6f21907d9ca11a13e07f7ad2f8 --- /dev/null +++ b/aiter/ops/triton/softmax.py @@ -0,0 +1,95 @@ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _softmax_kernel_online( + output_ptr, + input_ptr, + input_row_stride, + output_row_stride, + n_rows, + n_cols, + BLOCK_SIZE: tl.constexpr, +): + + row_start = tl.program_id(0) + row_idx = row_start + + # loop 1, find max and sum + m = -float("inf") # Initial 
def softmax(x):
    """
    Computes the row-wise softmax of a 2D input tensor.

    Key parameters:
        x (torch.Tensor): A 2D input tensor.

    Returns:
        torch.Tensor: A tensor of the same shape as 'x', where softmax has been
        applied along the last dimension (row-wise).

    Raises:
        ValueError: If 'x' is not 2-dimensional.

    Note:
        - The input tensor 'x' must reside on the GPU.
        - Inputs whose last dimension is not unit-stride are copied to a
          contiguous buffer first, because the kernel addresses the columns
          of a row as consecutive elements.
    """
    if x.ndim != 2:
        raise ValueError(f"softmax expects a 2D tensor, got {x.ndim}D input")
    n_rows, n_cols = x.shape

    # Nothing to compute; also avoids a zero-sized BLOCK_SIZE for n_cols == 0.
    if n_rows == 0 or n_cols == 0:
        return torch.empty_like(x)

    # The kernel only receives the row stride and assumes a column stride of 1.
    if x.stride(1) != 1:
        x = x.contiguous()

    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(n_cols))
    y = torch.empty_like(x)

    waves_per_eu = 2
    num_warps = 8
    num_stages = 2

    # One program per row; each program loops over the row in BLOCK_SIZE tiles.
    grid = (n_rows,)
    _softmax_kernel_online[grid](
        y,
        x,
        x.stride(0),
        y.stride(0),
        n_rows,
        n_cols,
        BLOCK_SIZE,
        waves_per_eu=waves_per_eu,
        num_warps=num_warps,
        num_stages=num_stages,
    )

    return y
def _pick_block(m: int, k: int) -> int:
    """Pick the BLOCK size (a power of two) for the 1-stage top-k kernel.

    Starts at the larger of 128 and ``k`` rounded up to a power of two, then
    doubles until the block covers a row of length ``m`` or reaches 1024.
    Rounding ``k`` up matters because the kernel builds its lane offsets with
    ``tl.arange(0, BLOCK)``; doubling from a non-power-of-two start (e.g.
    k=200 -> 200, 400, 800) would never produce a usable block size.
    """
    k_pow2 = 1 << max(k - 1, 0).bit_length()
    blk = max(128, k_pow2)
    while blk < m and blk < 1024:
        blk <<= 1
    return blk


def one_stage_topk(
    x: torch.Tensor,
    k: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Top-k along the last dim of a 2-D tensor using the single-launch kernel.

    Args:
        x: Input of shape (B, M); one kernel program handles one row.
        k: Number of elements to select per row.

    Returns:
        ``(values, indices)``, both of shape (B, k); values have ``x.dtype``
        and indices are int64.

    Raises:
        ValueError: If the row does not fit in a single block of at most
            1024 elements.
    """
    B, M = x.shape
    BLOCK = _pick_block(M, k)
    if M > BLOCK or BLOCK > 1024:
        raise ValueError("row length too large for this kernel (<=1024)")

    # Make the data and the stride handed to the kernel refer to the same
    # tensor: the kernel reads row elements at consecutive addresses.
    x = x.contiguous()

    out_v = torch.empty((B, k), device=x.device, dtype=x.dtype)
    out_i = torch.empty((B, k), device=x.device, dtype=torch.int64)

    _topk_kernel[(B,)](
        x,
        out_v,
        out_i,
        x.stride(0),
        out_v.stride(0),
        out_i.stride(0),
        M=M,
        K=k,
        BLOCK=BLOCK,
        num_warps=4,
        num_stages=2,
    )
    return out_v, out_i
chunk_select_idx, + tl.constexpr(torch.finfo(torch.float32).min), + x_val, + ) + else: + x_val = tl.where( + cols == chunk_select_idx, + tl.constexpr(torch.finfo(torch.float32).max), + x_val, + ) + + +@triton.jit +def _compare_and_swap(x, ids, flip, i: core.constexpr, n_dims: core.constexpr): + n_outer: core.constexpr = x.numel >> n_dims + shape: core.constexpr = [n_outer * 2**i, 2, 2 ** (n_dims - i - 1)] + + y = core.reshape(x, shape) + y_idx = core.reshape(ids, shape) + + # slice left/right with 'stride' 2**(n_dims - i - 1) + mask = core.arange(0, 2)[None, :, None] + left = core.broadcast_to(tl.sum(y * (1 - mask), 1)[:, None, :], shape).to(x.dtype) + right = core.broadcast_to(tl.sum(y * mask, 1)[:, None, :], shape).to(x.dtype) + left = core.reshape(left, x.shape) + right = core.reshape(right, x.shape) + + left_idx = core.broadcast_to(tl.sum(y_idx * (1 - mask), 1)[:, None, :], shape).to( + ids.dtype + ) + right_idx = core.broadcast_to(tl.sum(y_idx * mask, 1)[:, None, :], shape).to( + ids.dtype + ) + left_idx = core.reshape(left_idx, ids.shape) + right_idx = core.reshape(right_idx, ids.shape) + + # actual compare-and-swap + if core.constexpr(x.dtype.primitive_bitwidth) == 8: + idtype = core.int8 + elif core.constexpr(x.dtype.primitive_bitwidth) == 16: + idtype = core.int16 + elif core.constexpr(x.dtype.primitive_bitwidth) == 32: + idtype = core.int32 + elif core.constexpr(x.dtype.primitive_bitwidth) == 64: + idtype = core.int64 + else: + raise ValueError("Unsupported dtype") + + ileft = left.to(idtype, bitcast=True) + iright = right.to(idtype, bitcast=True) + ix = x.to(idtype, bitcast=True) + + cond = (left > right) ^ flip + ret = ix ^ core.where(cond, ileft ^ iright, zeros_like(ix)) + + if core.constexpr(ids.dtype.primitive_bitwidth) == 8: + idx_dtype = core.int8 + elif core.constexpr(ids.dtype.primitive_bitwidth) == 16: + idx_dtype = core.int16 + elif core.constexpr(ids.dtype.primitive_bitwidth) == 32: + idx_dtype = core.int32 + elif 
@triton.jit
def _bitonic_merge(
    x, ids, stage: core.constexpr, order: core.constexpr, n_dims: core.constexpr
):
    """
    Run one merge stage of a bitonic sort over `x`, permuting `ids` in lockstep.

    x:      values being sorted
    ids:    companion tensor (e.g. element indices) that receives exactly the
            same swaps as `x`
    stage:  number of compare-and-swap rounds to perform; must be <= n_dims
    order:  direction selector
            order_type 0 == ascending
            order_type 1 == descending
            order_type 2 == alternating
    n_dims: log2 of the length being sorted (x.numel >> n_dims gives the
            number of independent outer sequences)

    Returns the updated (x, ids) pair.
    """
    n_outer: core.constexpr = x.numel >> n_dims
    core.static_assert(stage <= n_dims)
    # flip denotes whether to re-arrange sub-sequences of elements in ascending or
    # descending order.
    # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
    # if flip = 00110011... then all the elements will be re-arranged alternatingly (with
    # a stride of 2) at this stage
    if order == 2:
        # Build a 0/1 pattern that alternates every 2**stage elements and
        # flatten it back to the shape of x.
        shape: core.constexpr = [n_outer * 2 ** (n_dims - 1 - stage), 2, 2**stage]
        flip = core.reshape(
            core.broadcast_to(core.arange(0, 2)[None, :, None], shape), x.shape
        )
    else:
        # Uniform direction: the scalar order (0 or 1) is used as the flip bit.
        flip = order
    # perform `stage` rounds of `compare-and-swap`
    for i in core.static_range(stage):
        # Rounds are numbered relative to n_dims, so the last `stage` of the
        # n_dims possible rounds are the ones executed here.
        x, ids = _compare_and_swap(x, ids, flip, i + (n_dims - stage), n_dims)
    return x, ids
def two_stage_topk(x, k, dim=-1, largest=True):
    """Top-k along the last dimension using the chunked two-kernel path.

    Stage 1 splits every row into chunks and selects ``k`` candidates per
    chunk; stage 2 sorts all candidates of a row and keeps the first ``k``.

    Args:
        x: Input tensor; the reduction runs over its last dimension.
        k: Number of elements to select per row.
        dim: Must refer to the last dimension (``-1`` or ``x.ndim - 1``).
            Both kernels address rows as ``batch * N + column``, so no other
            dimension can be reduced.
        largest: Select the largest (True) or smallest (False) elements.

    Returns:
        ``(values, indices)`` with shape ``x.shape[:-1] + (k,)``; values have
        ``x.dtype`` and indices are int64.

    Raises:
        ValueError: If ``dim`` is not the last dimension or ``k`` is outside
            ``[1, x.shape[-1]]``.
    """
    if dim < 0:
        dim += x.ndim
    if dim != x.ndim - 1:
        raise ValueError("only last-dim Top-K is implemented")

    descending = bool(largest)

    topk_elem_cnt = x.shape[dim]
    if k < 1 or k > topk_elem_cnt:
        raise ValueError(f"k must be in [1, {topk_elem_cnt}], got {k}")

    # The kernels compute addresses from a flat (batch, N) layout.
    x = x.contiguous()
    batch_size = math.prod(x.shape) // topk_elem_cnt

    chunk_size = 256 if topk_elem_cnt < 1024 else 1024
    # Every chunk has to be able to produce k candidates.
    if chunk_size < k:
        chunk_size = triton.next_power_of_2(k)

    chunk_num = triton.cdiv(topk_elem_cnt, chunk_size)

    stage1_out = torch.empty(batch_size * chunk_num * k, device=x.device, dtype=x.dtype)
    stage1_out_idx = torch.empty(
        batch_size * chunk_num * k, device=x.device, dtype=torch.int64
    )

    out_shape = x.shape[:-1] + (k,)
    stage2_out = torch.empty(out_shape, device=x.device, dtype=x.dtype)
    stage2_out_idx = torch.empty(out_shape, device=x.device, dtype=torch.int64)

    topk_stage1_kernel[
        batch_size,
        chunk_num,
    ](
        stage1_out,  # pointer to the per-chunk candidate values
        stage1_out_idx,  # pointer to the per-chunk candidate indices
        x,  # pointer to the input
        k,
        topk_elem_cnt,
        chunk_size,
        descending,
    )
    stage2_elem_cnt = chunk_num * k
    BLOCK_SIZE = triton.next_power_of_2(stage2_elem_cnt)

    topk_stage2_kernel[batch_size,](
        stage2_out,
        stage2_out_idx,
        stage1_out,
        stage1_out_idx,
        k,
        stage2_elem_cnt,
        BLOCK_SIZE,
        descending,
    )

    return (stage2_out, stage2_out_idx)
def topk(
    x: torch.Tensor,
    k: int,
    *,
    dim: int = -1,
    largest: bool = True,
    sorted: bool = True,
    tiny_row_thresh: int = MAX_TINY_ROW,
):
    """Select the ``k`` largest elements of each row of a 2-D tensor.

    Dispatches on the row length ``M = x.shape[-1]``:

    - ``M <= tiny_row_thresh``: the 1-stage kernel (``one_stage_topk``),
      which handles a whole row in one program. That path itself rejects
      rows longer than 1024, so raising the threshold above the default
      does not extend it.
    - otherwise: the 2-stage kernel (``two_stage_topk``), which computes a
      local top-k per tile and merges the partial results.

    Args:
        x: Input of shape (B, M). Non-contiguous inputs are copied.
        k: Number of elements to return per row; must satisfy
            ``0 <= k <= M``.
        dim: Only the last dimension is supported.
        largest: Only ``True`` is supported.
        sorted: Only ``True`` is supported.
        tiny_row_thresh: Largest row length routed to the 1-stage kernel.

    Returns:
        ``(values, indices)``, both of shape (B, k), on the same device as
        ``x``.

    Raises:
        ValueError: For an unsupported ``dim``/``largest``/``sorted``, a
            non-2-D input, or ``k`` outside ``[0, M]``.
    """
    if dim < 0:
        dim += x.ndim
    if dim != x.ndim - 1:
        raise ValueError("only last-dim Top-K is implemented")
    if x.ndim != 2:
        raise ValueError("input tensor must be 2-D (batch, M)")
    if not largest:
        raise ValueError("only largest=True supported")
    if not sorted:
        raise ValueError("sorted=False not supported")

    row_len = x.shape[-1]
    # Without this check an oversized k yields filler values and repeated
    # indices instead of an error.
    if k < 0 or k > row_len:
        raise ValueError(f"k must be in [0, {row_len}], got {k}")

    if not x.is_contiguous():
        x = x.contiguous()

    if row_len <= tiny_row_thresh:
        return one_stage_topk(x.view(-1, row_len), k)
    return two_stage_topk(x, k, dim=dim, largest=True)
is_hip_ = True

logger = logging.getLogger(__name__)

# Only print the following warnings when triton version < 3.2.0.
# The issue won't affect performance or accuracy.
# The check compares numeric (major, minor) components: comparing the raw
# version strings is lexicographic and would treat e.g. "3.10.0" as older
# than "3.2.0".
_triton_version = tuple(
    int(part) for part in triton.__version__.split(".")[:2] if part.isdigit()
)
if _triton_version < (3, 2):
    logger.warning(
        "The following error message 'operation scheduled before its operands' "
        "can be ignored.")
tl.program_id(2) + + cur_kv_head = cur_head // kv_group_num + + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dv = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lk + mask_dv = offs_dv < Lv + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_req_idx = cur_batch + + off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d + # [BLOCK_DMODEL] + q = tl.load(Q + off_q, mask=mask_d, other=0.0) + + kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, + cur_batch_seq_len) + + e_max = -float("inf") + e_sum = 0.0 + acc = tl.zeros([BLOCK_DV], dtype=tl.float32) + + if split_kv_end > split_kv_start: + for start_n in range(split_kv_start, split_kv_end, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + #[BLOCK_N] + kv_page_number = tl.load( + Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + + offs_n // PAGE_SIZE, + mask=offs_n < split_kv_end, + other=0, + ) + kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE + offs_buf_k = (kv_loc[:, None] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + offs_d[None, :]) + # [BLOCK_N, BLOCK_DMODEL] + k = tl.load( + K_Buffer + offs_buf_k, + mask=(offs_n[:, None] < split_kv_end) & (mask_d[None, :]), + other=0.0, + ) + # [1, BLOCK_DMODEL] * [BLOCK_N, BLOCK_DMODEL] -> .. 
-> [BLOCK_N] + # Equivalent to tl.dot(q, kt) + # TODO: Try tl.dot + qk = tl.sum(q[None, :] * k, 1) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * tanh(qk / logit_cap) + + qk = tl.where(offs_n < split_kv_end, qk, float("-inf")) + + offs_buf_v = (kv_loc[:, None] * stride_buf_vbs + + cur_kv_head * stride_buf_vh + offs_dv[None, :]) + # [BLOCK_N, BLOCK_DV] + v = tl.load( + V_Buffer + offs_buf_v, + mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]), + other=0.0, + ) + + n_e_max = tl.maximum(tl.max(qk, 0), e_max) + # TODO: Try use exp of 2 to approximate exp + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max) + acc *= re_scale + # [BLOCK_DV] + acc += tl.sum(p[:, None] * v, 0) + + e_sum = e_sum * re_scale + tl.sum(p, 0) + e_max = n_e_max + + offs_mid_o = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + offs_dv) + + tl.store( + Att_Out + offs_mid_o, + acc / e_sum, + mask=(mask_dv), + ) + + offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + Lv) + + tl.store( + Att_Out + offs_mid_o_1, + e_max + tl.log(e_sum), + ) + + +def _decode_att_m_fwd( + q, + k_buffer, + v_buffer, + att_out, + Req_to_tokens, + B_Seqlen, + num_kv_splits, + sm_scale, + page_size, + logit_cap, +): + BLOCK = 64 if not is_hip_ else 8 + + NUM_KV_SPLITS = num_kv_splits + Lk = k_buffer.shape[-1] + Lv = v_buffer.shape[-1] + + batch, head_num = q.shape[0], q.shape[1] + + grid = (batch, head_num, NUM_KV_SPLITS) + kv_group_num = q.shape[1] // k_buffer.shape[-2] + + num_warps = 4 + if kv_group_num != 1: + num_warps = 1 if is_hip_ else 2 + + BLOCK_DMODEL = triton.next_power_of_2(Lk) + BLOCK_DV = triton.next_power_of_2(Lv) + + _fwd_kernel_stage1[grid]( + q, + k_buffer, + v_buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + att_out, + Req_to_tokens.stride(0), + q.stride(0), + q.stride(1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., 
@triton.heuristics(
    values={
        "USE_MLS": lambda args: (
            (args["kv_group_num"] >= args["BLOCK_H"])
            and (args["q_head_num"] % args["BLOCK_H"] == 0)
            and (args["BLOCK_DMODEL"] <= args["Lk"])
            and (args["BLOCK_DMODEL"] * args["BLOCK_DMODEL_SPLIT"] + args["BLOCK_DPE"] <= args["Lk"])
        )
    }
)
@triton.jit
def _fwd_grouped_kernel_stage1(
    Q,
    K_Buffer,
    V_Buffer,
    sm_scale,
    Req_to_tokens,
    B_Seqlen,
    Att_Out,
    stride_req_to_tokens_b,
    stride_qbs,
    stride_qh,
    stride_buf_kbs,
    stride_buf_kh,
    stride_buf_vbs,
    stride_buf_vh,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    kv_group_num: tl.constexpr,
    q_head_num: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_DMODEL_SPLIT: tl.constexpr,
    BLOCK_DPE: tl.constexpr,
    BLOCK_DV: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_H: tl.constexpr,
    NUM_KV_SPLITS: tl.constexpr,
    PAGE_SIZE: tl.constexpr,
    logit_cap: tl.constexpr,
    Lk: tl.constexpr,
    Lv: tl.constexpr,
    USE_MLS: tl.constexpr,
):
    """
    Stage 1 of split-KV decode attention for grouped heads (GQA/MQA/MLA).

    Grid: (batch, head blocks, NUM_KV_SPLITS). Each program handles up to
    BLOCK_H query heads that share one KV head and one slice of the
    sequence's KV range. It walks that slice in tiles of BLOCK_N tokens with
    an online softmax and writes to Att_Out, per head and split:
      - [0, Lv): the attention output normalised by the local softmax sum
      - [Lv]:    the log-sum-exp of the local scores (e_max + log(e_sum))
    Nothing is written for a split whose KV slice is empty.

    The K head dimension is consumed as BLOCK_DMODEL_SPLIT chunks of
    BLOCK_DMODEL elements, plus an optional trailing chunk of BLOCK_DPE
    elements starting at BLOCK_DMODEL * BLOCK_DMODEL_SPLIT.
    """
    # TODO: Remove this after triton mls performance is improved
    # TODO: Use mls inside for loop leads to vgpr spill, why?
    # The heuristic value is overridden, so the matrix_load paths below are
    # currently disabled.
    USE_MLS = False
    tl.assume(stride_req_to_tokens_b > 0)
    tl.assume(stride_qbs > 0)
    tl.assume(stride_qh > 0)
    tl.assume(stride_buf_kbs > 0)
    tl.assume(stride_buf_kh > 0)
    tl.assume(stride_buf_vbs > 0)
    tl.assume(stride_buf_vh > 0)
    tl.assume(stride_mid_ob > 0)
    tl.assume(stride_mid_oh > 0)
    tl.assume(stride_mid_os > 0)

    cur_batch = tl.program_id(0)
    cur_head_id = tl.program_id(1)
    cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
    split_kv_id = tl.program_id(2)

    # Number of heads of this block that are real; the remaining lanes of the
    # BLOCK_H-wide tile are masked out through mask_h.
    if kv_group_num > BLOCK_H:
        VALID_BLOCK_H: tl.constexpr = BLOCK_H
    else:
        VALID_BLOCK_H: tl.constexpr = kv_group_num
    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
    mask_h = mask_h & (cur_head < q_head_num)

    offs_d = tl.arange(0, BLOCK_DMODEL)
    offs_dv = tl.arange(0, BLOCK_DV)
    mask_d = offs_d < Lk
    mask_dv = offs_dv < Lv
    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
    cur_batch_req_idx = cur_batch

    # With a single head-dim chunk, Q can be loaded once outside the KV loop.
    if BLOCK_DMODEL_SPLIT == 1:
        if USE_MLS:
            q = tl.matrix_load(
                Q + cur_batch * stride_qbs,
                shape=(q_head_num, Lk),
                strides=(stride_qh, 1),
                block_shape=(BLOCK_H, BLOCK_DMODEL),
                offsets=(cur_head_id * BLOCK_H, 0),
            )
        else:
            offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
            q = tl.load(Q + offs_q,
                        mask=(mask_h[:, None]) & (mask_d[None, :]),
                        other=0.0)

    if BLOCK_DPE > 0:
        offs_dpe = BLOCK_DMODEL * BLOCK_DMODEL_SPLIT + tl.arange(0, BLOCK_DPE)
        mask_dpe = offs_dpe < Lk
        if USE_MLS:
            qpe = tl.matrix_load(
                Q + cur_batch * stride_qbs,
                shape=(q_head_num, Lk),
                strides=(stride_qh, 1),
                block_shape=(BLOCK_H, BLOCK_DPE),
                offsets=(cur_head_id * BLOCK_H, BLOCK_DMODEL * BLOCK_DMODEL_SPLIT),
            )
        else:
            off_qpe = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh +
                       offs_dpe[None, :])
            qpe = tl.load(Q + off_qpe,
                          mask=(mask_h[:, None]) & (mask_dpe[None, :]),
                          other=0.0)

    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
    split_kv_start = kv_len_per_split * split_kv_id
    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
                              cur_batch_seq_len)

    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
    # The accumulator must be BLOCK_DV wide (the padded power of two), not Lv:
    # V tiles are loaded as [BLOCK_N, BLOCK_DV] and the result is stored with
    # offs_dv / mask_dv, so an Lv-wide tile only matches when Lv is already a
    # power of two.
    acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)

    if split_kv_end > split_kv_start:
        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
            offs_n = start_n + tl.arange(0, BLOCK_N)
            # Translate logical token positions to physical cache locations.
            kv_page_number = tl.load(
                Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
                offs_n // PAGE_SIZE,
                mask=offs_n < split_kv_end,
                other=0,
            )
            kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE
            if BLOCK_DMODEL_SPLIT == 1:
                # K is loaded transposed: [BLOCK_DMODEL, BLOCK_N].
                offs_buf_k = (kv_loc[None, :] * stride_buf_kbs +
                              cur_kv_head * stride_buf_kh + offs_d[:, None])
                k = tl.load(
                    K_Buffer + offs_buf_k,
                    mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
                    other=0.0,
                )
                qk = tl.dot(q, k.to(q.dtype))
            else:
                # Accumulate Q.K^T chunk by chunk over the head dimension.
                qk = tl.zeros([BLOCK_H, BLOCK_N], dtype=tl.float32)
                for start_d in range(0, BLOCK_DMODEL_SPLIT * BLOCK_DMODEL, BLOCK_DMODEL):
                    offs_d = start_d + tl.arange(0, BLOCK_DMODEL)
                    mask_d = offs_d < Lk
                    if USE_MLS:
                        q = tl.matrix_load(
                            Q + cur_batch * stride_qbs,
                            shape=(q_head_num, Lk),
                            strides=(stride_qh, 1),
                            block_shape=(BLOCK_H, BLOCK_DMODEL),
                            offsets=(cur_head_id * BLOCK_H, start_d),
                        )
                    else:
                        offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
                        q = tl.load(Q + offs_q,
                                    mask=(mask_h[:, None]) & (mask_d[None, :]),
                                    other=0.0)
                    offs_buf_k = (kv_loc[None, :] * stride_buf_kbs +
                                  cur_kv_head * stride_buf_kh + offs_d[:, None])
                    k = tl.load(
                        K_Buffer + offs_buf_k,
                        mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
                        other=0.0,
                    )
                    qk += tl.dot(q, k.to(q.dtype))
            if BLOCK_DPE > 0:
                offs_buf_kpe = (kv_loc[None, :] * stride_buf_kbs +
                                cur_kv_head * stride_buf_kh +
                                offs_dpe[:, None])
                kpe = tl.load(
                    K_Buffer + offs_buf_kpe,
                    mask=(offs_n[None, :] < split_kv_end) &
                    (mask_dpe[:, None]),
                    other=0.0,
                )
                qk += tl.dot(qpe, kpe.to(qpe.dtype))
            qk *= sm_scale

            if logit_cap > 0:
                qk = logit_cap * tanh(qk / logit_cap)

            # Padded heads and out-of-range tokens must not contribute.
            qk = tl.where(mask_h[:, None] & (offs_n[None, :] < split_kv_end),
                          qk, float("-inf"))

            offs_buf_v = (kv_loc[:, None] * stride_buf_vbs +
                          cur_kv_head * stride_buf_vh + offs_dv[None, :])
            v = tl.load(
                V_Buffer + offs_buf_v,
                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
                other=0.0,
            )

            # Online softmax: rescale the running sum/accumulator to the new max.
            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
            re_scale = tl.exp(e_max - n_e_max)
            p = tl.exp(qk - n_e_max[:, None])
            acc *= re_scale[:, None]
            acc += tl.dot(p.to(v.dtype), v)

            e_sum = e_sum * re_scale + tl.sum(p, 1)
            e_max = n_e_max

        offs_mid_o = (cur_batch * stride_mid_ob +
                      cur_head[:, None] * stride_mid_oh +
                      split_kv_id * stride_mid_os + offs_dv[None, :])

        tl.store(
            Att_Out + offs_mid_o,
            acc / e_sum[:, None],
            mask=(mask_h[:, None]) & (mask_dv[None, :]),
        )

        # The log-sum-exp goes into the slot right after the Lv outputs.
        offs_mid_o_1 = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh +
                        split_kv_id * stride_mid_os + Lv)

        tl.store(
            Att_Out + offs_mid_o_1,
            e_max + tl.log(e_sum),
            mask=mask_h,
        )
extra_kargs = {} + num_stages = 2 + if is_hip_: + extra_kargs = { + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } + num_stages = 1 + + _fwd_grouped_kernel_stage1[grid]( + q, + k_buffer, + v_buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + att_out, + Req_to_tokens.stride(0), + q.stride(0), + q.stride(1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + att_out.stride(0), + att_out.stride(1), + att_out.stride(2), + kv_group_num=kv_group_num, + q_head_num=head_num, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DMODEL_SPLIT=BLOCK_DMODEL_SPLIT, + BLOCK_DPE=BLOCK_DPE, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + NUM_KV_SPLITS=NUM_KV_SPLITS, + PAGE_SIZE=page_size, + logit_cap=logit_cap, + num_warps=2 if BLOCK_DMODEL_SPLIT > 1 else 4, + num_stages=num_stages, + Lk=Lk, + Lv=Lv, + **extra_kargs, + ) + + +# NOTE: 有必要拆分成两个 stage 吗? 
@triton.jit
def _fwd_kernel_stage2(
    Mid_O,
    o,
    B_Seqlen,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    stride_obs,
    stride_oh,
    NUM_KV_SPLITS: tl.constexpr,
    BLOCK_DV: tl.constexpr,
    Lv: tl.constexpr,
):
    # Stage 2 of split-KV decode attention: one program per (batch, head)
    # combines the NUM_KV_SPLITS partial results stored in Mid_O into the
    # final output row in `o`.
    #
    # For every split, Mid_O holds Lv partial output values followed (at
    # offset Lv) by one scalar that is used here as the split's log-weight.
    # Splits are merged with a running maximum so the exponentials stay
    # bounded.
    tl.assume(stride_mid_ob > 0)
    tl.assume(stride_mid_oh > 0)
    tl.assume(stride_mid_os > 0)
    tl.assume(stride_obs > 0)
    tl.assume(stride_oh > 0)

    cur_batch = tl.program_id(0)
    cur_head = tl.program_id(1)

    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)

    # BLOCK_DV lanes cover the Lv output elements; extra lanes are masked.
    offs_d = tl.arange(0, BLOCK_DV)
    mask_d = offs_d < Lv

    e_sum = 0.0
    e_max = -float("inf")
    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)

    # Base offsets of the value vector and of the scalar stored after it.
    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
    offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv

    # TODO: compensate for the per-split (split_kv_id) softmax done in stage 1
    for split_kv_id in range(0, NUM_KV_SPLITS):
        # Recompute the split's KV range so that empty splits are skipped.
        kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
        split_kv_start = kv_len_per_split * split_kv_id
        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split,
                                  cur_batch_seq_len)

        if split_kv_end > split_kv_start:
            tv = tl.load(Mid_O + offs_v + split_kv_id * stride_mid_os,
                         mask=mask_d,
                         other=0.0)
            tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os)
            n_e_max = tl.maximum(tlogic, e_max)

            # Rescale what has been accumulated so far to the new maximum,
            # then add this split weighted by exp(tlogic - max).
            old_scale = tl.exp(e_max - n_e_max)
            acc *= old_scale
            exp_logic = tl.exp(tlogic - n_e_max)
            acc += exp_logic * tv

            e_sum = e_sum * old_scale + exp_logic
            e_max = n_e_max

    # NOTE(review): if every split is empty (sequence length 0) e_sum stays
    # 0.0 and this division is 0/0 - confirm callers never pass such rows.
    tl.store(
        o + cur_batch * stride_obs + cur_head * stride_oh + offs_d,
        acc / e_sum,
        mask=mask_d,
    )
def decode_attention_fwd(
    q,
    k_buffer,
    v_buffer,
    o,
    req_to_token,
    b_seq_len,
    attn_logits,
    num_kv_splits,
    sm_scale,
    page_size=1,
    logit_cap=0.0,
):
    """Entry point for decode attention; picks the MHA or grouped-head path.

    The number of query heads per KV head is derived from ``q.shape[1]`` and
    ``v_buffer.shape[-2]``. A ratio of 1 selects
    ``decode_attention_fwd_normal``; any other ratio selects
    ``decode_attention_fwd_grouped``. Both receive the arguments unchanged
    and write their result into ``o``.

    ``attn_logits.shape[2]`` must equal ``num_kv_splits``.
    """
    assert num_kv_splits == attn_logits.shape[2]
    heads_per_kv = q.shape[1] // v_buffer.shape[-2]

    # One query head per KV head is plain MHA; everything else is GQA/MQA/MLA.
    run = (decode_attention_fwd_normal
           if heads_per_kv == 1 else decode_attention_fwd_grouped)
    run(
        q,
        k_buffer,
        v_buffer,
        o,
        req_to_token,
        b_seq_len,
        attn_logits,
        num_kv_splits,
        sm_scale,
        page_size,
        logit_cap,
    )
b/aiter/ops/triton/unified_attention.py @@ -0,0 +1,1343 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Authors: +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell + +import functools +import json +import os +from typing import Dict, Optional + +import torch +import triton +import triton.language as tl +from aiter import logger + +try: + from flash_attn import varlen_fwd_unified +except Exception: + varlen_fwd_unified = None + + +def _is_rocm() -> bool: + return getattr(torch.version, "hip", None) is not None + + +def _default_fp8_dtype() -> torch.dtype: + if _is_rocm() and hasattr(torch, "float8_e4m3fnuz"): + return torch.float8_e4m3fnuz + return torch.float8_e4m3fn + + +_IS_ROCM = _is_rocm() +float8_info = torch.finfo(_default_fp8_dtype()) + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def apply_softcap(S, x): + Sdiv = S / x + p1 = tl.exp(Sdiv) + p2 = tl.exp(-Sdiv) + return x * (p1 - p2) / (p1 + p2) + + +@triton.jit +def find_seq_idx( + query_start_len_ptr, + target_idx, + num_seqs, + BLOCK_Q: tl.constexpr, + use_q_block_mode: tl.constexpr, +): + left: tl.int32 = 0 + right = num_seqs + while left < right: + mid = (left + right) // 2 + val = tl.load(query_start_len_ptr + mid) + mid_val = val // BLOCK_Q + mid if use_q_block_mode else val + + if mid_val <= target_idx: + left = mid + 1 + else: + right = mid + + return left - 1 + + +@triton.jit +def kernel_unified_attention_2d( + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + sink_ptr, # [num_query_heads] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] + 
scale, # float32 + k_scale, # float32 + v_scale, # float32 + out_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int must be power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_ALIBI_SQRT: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + USE_MM_PREFIX: tl.constexpr, # bool + MAX_MM_RANGES: tl.constexpr, # int + mm_prefix_range_ptr, # [num_seqs] - prefix length for each sequence + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + USE_FP8: tl.constexpr, # bool + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, +): + tl.assume(stride_k_cache_0 > 0) + tl.assume(stride_k_cache_1 > 0) + tl.assume(stride_k_cache_2 > 0) + tl.assume(stride_k_cache_3 > 0) + tl.assume(stride_v_cache_0 > 0) + tl.assume(stride_v_cache_1 > 0) + tl.assume(stride_v_cache_2 > 0) + tl.assume(stride_v_cache_3 > 0) + tl.assume(query_ptr.to(tl.int64) >= 0) + + + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + BLOCK_Q = BLOCK_M // num_queries_per_kv + + seq_idx = find_seq_idx( + query_start_len_ptr, 
q_block_global_idx, num_seqs, BLOCK_Q, True + ) + + q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv + query_offset = ( + query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + + offs_d[None, :] + ) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + if not USE_SINKS: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load( + 
alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0 + ) + + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = ( + qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = ( + context_len + + q_block_local_idx * BLOCK_Q + + (BLOCK_M - 1) // num_queries_per_kv + + 1 + ) + + if USE_MM_PREFIX: + # image bidirectional attention ranges require a full range + # including q_block padding to make sure doc mask is correct + max_seq_prefix_len = tl.maximum(max_seq_prefix_len, seq_len) + else: + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) + + # ---- Sliding-window tile pruning -------------------- + # Default: keep previous global behavior + tile_start = 0 + tile_end = num_tiles + # TODO(Isotr0py): sliding window pruning with image bidirectional mask + if SLIDING_WINDOW > 0 and not USE_MM_PREFIX: + # Query rows covered by this Q-block + qpos_lo = q_block_local_idx * BLOCK_Q + qpos_hi = tl.minimum( + qpos_lo + (BLOCK_M - 1) // num_queries_per_kv, + cur_batch_query_len - 1, + ) + # For sliding window, each query position q can only attend to + # keys in the range [q_abs - SLIDING_WINDOW + 1, q_abs] + # where q_abs = context_len + q + # The union of allowed key positions for this Q-block is: + # [context_len + qpos_lo - SLIDING_WINDOW + 1, context_len + qpos_hi] + first_allowed_key = context_len + qpos_lo - SLIDING_WINDOW + 1 + last_allowed_key = context_len + qpos_hi + # Convert to tile indices and clamp + tile_start = tl.maximum(0, 
first_allowed_key // TILE_SIZE) + tile_end = tl.minimum((last_allowed_key // TILE_SIZE) + 1, num_tiles) + + # iterate through tiles (now limited to the sliding window range) + for j in range(tile_start, tile_end): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len + + physical_block_idx = tl.load( + block_tables_ptr + block_table_offset + seq_offset // BLOCK_SIZE + ).to(tl.int64) + + v_offset = ( + physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1 + ) + + k_offset = ( + physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1 + ) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load( + key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0, + ) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load( + value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0, + ) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + # Compute attention mask: causal by default (key <= query) + query_abs_pos = context_len + query_pos[:, None] + seq_mask = seq_offset[None, :] <= query_abs_pos + + # Apply sliding window to base mask BEFORE mm_prefix OR. + # Order must match FlexAttention: (causal AND sliding_window) OR mm_prefix + if SLIDING_WINDOW > 0: + seq_mask = seq_mask & ((query_abs_pos - seq_offset) < SLIDING_WINDOW) + + # PrefixLM: extend mask with bidirectional ranges for multimodal tokens. + # Applied AFTER sliding window so mm_prefix ranges override SW restriction. 
+ if USE_MM_PREFIX: + for i in range(MAX_MM_RANGES): + range_start = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + ) + range_end = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + 1 + ) + + is_valid = range_start < range_end + q_in_range = ( + (query_abs_pos >= range_start) + & (query_abs_pos <= range_end) + & is_valid + ) + k_in_range = ( + (seq_offset[None, :] >= range_start) + & (seq_offset[None, :] <= range_end) + & is_valid + ) + seq_mask |= q_in_range & k_in_range + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where( + query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") + ) + + if USE_ALIBI_SLOPES: + if USE_ALIBI_SQRT: + relative_pos = seq_offset - (context_len + query_pos[:, None]) + alibi_offset = tl.where( + relative_pos <= 0, + -tl.sqrt((-relative_pos).to(tl.float32)), + 0.0, + ) + else: + alibi_offset = seq_offset - context_len + S += alibi_slope[:, None] * alibi_offset + + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + if SLIDING_WINDOW: + qpos_lo = q_block_local_idx * BLOCK_Q + V = tl.where( + (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0 + ) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + if USE_FP8: + acc = acc * tl.load(out_scale) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + + output_offset = ( + query_offset_0[:, None] * output_stride_0 + + query_offset_1[:, None] * output_stride_1 + + offs_d[None, :] + ) + + tl.store( + output_ptr + output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + + +@triton.jit +def kernel_unified_attention_3d( + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size_padded] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: 
tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_ALIBI_SQRT: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + USE_MM_PREFIX: tl.constexpr, # bool + MAX_MM_RANGES: tl.constexpr, # int + mm_prefix_range_ptr, # [num_seqs] - prefix length for each sequence +): + tl.assume(stride_k_cache_0 > 0) + tl.assume(stride_k_cache_1 > 0) + tl.assume(stride_k_cache_2 > 0) + tl.assume(stride_k_cache_3 > 0) + tl.assume(stride_v_cache_0 > 0) + tl.assume(stride_v_cache_1 > 0) + tl.assume(stride_v_cache_2 > 0) + tl.assume(stride_v_cache_3 > 0) + tl.assume(block_table_stride >= 0) + tl.assume(query_stride_0 >= 0) + tl.assume(query_stride_1 >= 0) + tl.assume(qq_bias_stride_0 >= 0) + tl.assume(query_ptr.to(tl.int64) >= 0) + + + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + segm_idx = tl.program_id(2) + + seq_idx = find_seq_idx( + query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True + ) + + q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + 
cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + if segm_idx * tiles_per_segment * TILE_SIZE >= seq_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv + query_offset = ( + query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + + offs_d[None, :] + ) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + if USE_SINKS: + if segm_idx == 0: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + else: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load( + alibi_slopes_ptr + query_offset_1, mask=query_mask_1, 
other=0.0 + ) + + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = ( + qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = ( + context_len + + q_block_local_idx * BLOCK_Q + + (BLOCK_M - 1) // num_queries_per_kv + + 1 + ) + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) + + # ---- Sliding-window tile pruning -------------------- + # Default: keep previous global behavior + tile_start = 0 + tile_end = num_tiles + # TODO(Isotr0py): sliding window pruning with image bidirectional mask + if SLIDING_WINDOW > 0 and not USE_MM_PREFIX: + # Query rows covered by this Q-block + qpos_lo = q_block_local_idx * BLOCK_Q + qpos_hi = tl.minimum( + qpos_lo + (BLOCK_M - 1) // num_queries_per_kv, + cur_batch_query_len - 1, + ) + # For sliding window, each query position q can only attend to + # keys in the range [q_abs - SLIDING_WINDOW + 1, q_abs] + # where q_abs = context_len + q + # The union of allowed key positions for this Q-block is: + # [context_len + qpos_lo - SLIDING_WINDOW + 1, context_len + qpos_hi] + first_allowed_key = context_len + qpos_lo - SLIDING_WINDOW + 1 + last_allowed_key = context_len + qpos_hi + # Convert to tile indices and clamp + tile_start = tl.maximum(0, first_allowed_key // TILE_SIZE) + tile_end = tl.minimum((last_allowed_key // TILE_SIZE) + 1, num_tiles) + + # iterate through tiles (now limited to the sliding window range) + for j in range( + max(segm_idx * tiles_per_segment, tile_start), + min((segm_idx + 1) * tiles_per_segment, 
tile_end), + ): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len + + physical_block_idx = tl.load( + block_tables_ptr + block_table_offset + seq_offset // BLOCK_SIZE + ).to(tl.int64) + + v_offset = ( + physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1 + ) + + k_offset = ( + physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1 + ) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load( + key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0, + ) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load( + value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0, + ) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + # Compute attention mask: causal by default (key <= query) + query_abs_pos = context_len + query_pos[:, None] + seq_mask = seq_offset[None, :] <= query_abs_pos + + # Apply sliding window to base mask BEFORE mm_prefix OR. + # Order must match FlexAttention: (causal AND sliding_window) OR mm_prefix + if SLIDING_WINDOW > 0: + seq_mask = seq_mask & ((query_abs_pos - seq_offset) < SLIDING_WINDOW) + + # PrefixLM: extend mask with bidirectional ranges for multimodal tokens. + # Applied AFTER sliding window so mm_prefix ranges override SW restriction. 
+ if USE_MM_PREFIX: + for i in range(MAX_MM_RANGES): + range_start = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + ) + range_end = tl.load( + mm_prefix_range_ptr + seq_idx * MAX_MM_RANGES * 2 + i * 2 + 1 + ) + + is_valid = range_start < range_end + q_in_range = ( + (query_abs_pos >= range_start) + & (query_abs_pos <= range_end) + & is_valid + ) + k_in_range = ( + (seq_offset[None, :] >= range_start) + & (seq_offset[None, :] <= range_end) + & is_valid + ) + seq_mask |= q_in_range & k_in_range + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where( + query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") + ) + + if USE_ALIBI_SLOPES: + if USE_ALIBI_SQRT: + relative_pos = seq_offset - (context_len + query_pos[:, None]) + alibi_offset = tl.where( + relative_pos <= 0, + -tl.sqrt((-relative_pos).to(tl.float32)), + 0.0, + ) + else: + alibi_offset = seq_offset - context_len + S += alibi_slope[:, None] * alibi_offset + + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + if SLIDING_WINDOW: + qpos_lo = q_block_local_idx * BLOCK_Q + V = tl.where( + (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0 + ) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + segm_output_offset = ( + query_offset_0[:, None].to(tl.int64) + * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :] + ) + tl.store( + segm_output_ptr + segm_output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + segm_offset = ( + query_offset_0.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + + segm_idx + ) + tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, L, mask=query_mask_0 & query_mask_1) + + +@triton.jit +def reduce_segments( + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + # [num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + out_scale_inv, # float32 + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + TILE_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + 
HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + USE_FP8: tl.constexpr, # bool + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, +): + tl.assume(output_stride_0 >= 0) + tl.assume(output_stride_1 >= 0) + tl.assume(segm_max_ptr.to(tl.int64) >= 0) + + + query_token_idx = tl.program_id(0) + query_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx( + query_start_len_ptr, query_token_idx, num_seqs, BLOCK_Q, False + ) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + # create masks for subsequent loads + act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE) + segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32 + ) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1) + + # load segment maxima + segm_offset = ( + query_token_idx.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ) + ) + segm_max = tl.load(segm_max_ptr + segm_offset, mask=segm_mask, other=float("-inf")) + overall_max = tl.max(segm_max) + + # load and rescale segment exp sums + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, mask=segm_mask, other=0.0) + segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) + overall_expsum = tl.sum(segm_expsum) + + # load, rescale, and add segment attention outputs + segm_output_offset = ( + query_token_idx.to(tl.int64) + * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED + + 
tl.arange(0, HEAD_SIZE_PADDED)[None, :] + ) + segm_output = tl.load( + segm_output_ptr + segm_output_offset, + mask=segm_mask[:, None] & dim_mask[None, :], + other=0.0, + ) + segm_output *= tl.exp(segm_max - overall_max)[:, None] + acc_sum = tl.sum(segm_output, axis=0) + # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0 + acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) + + if USE_FP8: + acc = acc * tl.load(out_scale_inv) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + + # write result + output_offset = ( + query_token_idx * output_stride_0 + + query_head_idx * output_stride_1 + + tl.arange(0, HEAD_SIZE_PADDED) + ) + tl.store(output_ptr + output_offset, acc, mask=dim_mask) + + +def _is_gemma3_attention(head_size: int, sliding_window: int) -> bool: + """Detect Gemma3 models via unique (head_size, sliding_window) signature. + + Gemma3 models are the only ones using sliding_window=1024 with + head_size 128 (27B) or 256 (1B, 4B, 12B). Other SWA models use + different window sizes (Mistral=4096, Phi-3=2047). + """ + return sliding_window == 1024 and head_size in (128, 256) + + +def _get_tile_size( + head_size: int, + sliding_window: int, + element_size: int, + is_prefill: bool, +) -> int: + """Select tile size with Gemma3-specific optimization. + + For Gemma3, use 32 for both prefill and decode to better utilize + the larger head dimension (128/256). For other models, use + the default vLLM behavior. 
+ """ + if _is_gemma3_attention(head_size, sliding_window): + # Gemma3: use 32 for decode (default is 16) + return 32 + + # Default behavior + if is_prefill: + return 32 + return 16 if element_size >= 2 else 32 + + +def _get_unified_attention_config_dir() -> str: + return os.path.join(os.path.dirname(__file__), "configs", "unified_attention") + + +def _load_ranked_kernel_config(config_file_path: str, selector: int) -> Optional[Dict]: + if not os.path.exists(config_file_path): + return None + with open(config_file_path) as f: + payload = json.load(f) + configs = {int(key): val for key, val in payload["config"].items()} + if not configs: + return None + selector_padded = triton.next_power_of_2(max(1, selector)) + return configs[min(configs.keys(), key=lambda x: abs(x - selector_padded))] + + +@functools.lru_cache +def _get_unified_attention_2d_config( + cache_block_size: int, + head_size: int, + num_queries_per_kv: int, + sliding_window: int, + use_alibi_slopes: bool, + use_qq_bias: bool, + use_softcap: bool, + use_sinks: bool, + use_mm_prefix: bool, + use_fp8: bool, + kv_dtype: torch.dtype, +) -> Optional[Dict]: + kv_type = "fp8" if "float8" in str(kv_dtype) else "auto" + device_name = triton.runtime.driver.active.get_current_target().arch + head_size_padded = triton.next_power_of_2(head_size) + head_dim_pad_req = head_size != head_size_padded + json_file_name = ( + f"unified_attention_2d-device={device_name}" + f"-bs={cache_block_size}" + f"-hs={head_size}" + f"-sw={sliding_window}" + f"-alibi={int(use_alibi_slopes)}" + f"-qq={int(use_qq_bias)}" + f"-softcap={int(use_softcap)}" + f"-sinks={int(use_sinks)}" + f"-mm={int(use_mm_prefix)}" + f"-fp8={int(use_fp8)}" + f"-hsp={head_size_padded}" + f"-pad={int(head_dim_pad_req)}" + f"-kv={kv_type}.json" + ) + config_file_path = os.path.join(_get_unified_attention_config_dir(), json_file_name) + config = _load_ranked_kernel_config(config_file_path, num_queries_per_kv) + if config: + return config + logger.warning( + 
@functools.lru_cache
def _get_unified_attention_3d_config(
    cache_block_size: int,
    head_size: int,
    num_queries_per_kv: int,
    sliding_window: int,
    use_alibi_slopes: bool,
    use_qq_bias: bool,
    use_softcap: bool,
    use_sinks: bool,
    use_mm_prefix: bool,
    num_segments_per_seq: int,
    kv_dtype: torch.dtype,
) -> Optional[Dict]:
    """Look up the tuned launch config for the 3D unified-attention kernel.

    The arguments are encoded into a JSON file name inside the directory
    returned by ``_get_unified_attention_config_dir()``; the entry of that file
    is then selected by ``num_queries_per_kv``.

    Returns:
        The selected config, or ``None`` (after logging a warning) when no
        usable config was found. Results are memoized per argument tuple by
        ``functools.lru_cache``.
    """
    arch = triton.runtime.driver.active.get_current_target().arch
    padded_head_size = triton.next_power_of_2(head_size)
    needs_head_padding = padded_head_size != head_size

    kv_tag = "auto"
    if "float8" in str(kv_dtype):
        kv_tag = "fp8"

    # Joined with "-" these fields form the on-disk file name.
    name_fields = [
        f"unified_attention_3d-device={arch}",
        f"bs={cache_block_size}",
        f"hs={head_size}",
        f"sw={sliding_window}",
        f"alibi={int(use_alibi_slopes)}",
        f"qq={int(use_qq_bias)}",
        f"softcap={int(use_softcap)}",
        f"sinks={int(use_sinks)}",
        f"mm={int(use_mm_prefix)}",
        f"seg={num_segments_per_seq}",
        f"hsp={padded_head_size}",
        f"pad={int(needs_head_padding)}",
        f"kv={kv_tag}.json",
    ]
    config_file_path = os.path.join(
        _get_unified_attention_config_dir(), "-".join(name_fields)
    )

    ranked = _load_ranked_kernel_config(config_file_path, num_queries_per_kv)
    if not ranked:
        logger.warning(
            "\nUsing default unified_attention_3d kernel config. Performance might "
            f"be sub-optimal! Config not found at {config_file_path}"
        )
        return None
    return ranked
q.shape[1] + num_kv_heads = k.shape[2] + num_queries_per_kv = num_query_heads // num_kv_heads + head_size = q.shape[2] + + BLOCK_M = ( + 16 if num_queries_per_kv <= 16 else triton.next_power_of_2(num_queries_per_kv) + ) + BLOCK_Q = BLOCK_M // num_queries_per_kv + + # Ideally we would launch with kernel with: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. + # However, it is slow to realize the query_lens on cpu. + # Instead we use upper-bound: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] + # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] + # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs + # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs + # = floor(q.shape[0] / BLOCK_Q) + num_seqs + total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + # Tile sizes for prefill and decode. Gemma3 models use optimized values. + # Note: tile size must be at least 32 for fp8 (element_size == 1). + sliding_window_val = 1 + window_size[0] if window_size[0] >= 0 else 0 + TILE_SIZE_PREFILL = _get_tile_size( + head_size, + sliding_window_val, + q.element_size(), + is_prefill=True, + ) + TILE_SIZE_DECODE = _get_tile_size( + head_size, + sliding_window_val, + q.element_size(), + is_prefill=False, + ) + + # Launch the 2D kernel if + # 1. No intermediate tiled softmax buffers for the 3D kernel have been allocated, or + # 2. The batch includes at least one prefill request, or + # 3. 
The number of sequences exceeds the configured threshold + if ( + seq_threshold_3D is None + or num_par_softmax_segments is None + or softmax_segm_output is None + or softmax_segm_max is None + or softmax_segm_expsum is None + or max_seqlen_q > 1 + or num_seqs > seq_threshold_3D + ): + use_fa_unified_2d = ( + _IS_ROCM + and varlen_fwd_unified is not None + and block_size % 64 == 0 + and head_size == 256 + ) + if not use_fa_unified_2d: + config = _get_unified_attention_2d_config( + cache_block_size=block_size, + head_size=head_size, + num_queries_per_kv=num_queries_per_kv, + sliding_window=sliding_window_val, + use_alibi_slopes=use_alibi_slopes, + use_qq_bias=use_qq_bias, + use_softcap=(softcap > 0), + use_sinks=(sinks is not None), + use_mm_prefix=use_mm_prefix, + use_fp8=(output_scale is not None), + kv_dtype=k.dtype, + ) + if not config: + config = { + "BLOCK_M": BLOCK_M, + "TILE_SIZE": TILE_SIZE_PREFILL, + "num_warps": 4, + "num_stages": 2, + } + + BLOCK_Q = config["BLOCK_M"] // num_queries_per_kv + total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + launch_config_2d = dict(config) + + grid_2d = (total_num_q_blocks, num_kv_heads) + kernel_unified_attention_2d[grid_2d]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + sink_ptr=sinks, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + out_scale=1 / output_scale if output_scale is not None else 1.0, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + 
USE_ALIBI_SQRT=use_alibi_sqrt, + USE_QQ_BIAS=use_qq_bias, + USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), + USE_MM_PREFIX=use_mm_prefix, + MAX_MM_RANGES=max_mm_ranges, + mm_prefix_range_ptr=mm_prefix_range, + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + num_seqs=num_seqs, + USE_FP8=output_scale is not None, + **launch_config_2d, + ) + else: + varlen_fwd_unified( + q=q, + k=k, + v=v, + cu_seqlens_q=cu_seqlens_q, + seqused_k=seqused_k, + block_table=block_table, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + softmax_scale=softmax_scale, + causal=causal, + softcap=softcap, + window_size=window_size, + alibi_slopes=alibi_slopes, + use_alibi_sqrt=use_alibi_sqrt, + qq_bias=qq_bias, + s_aux=sinks, + mm_prefix_range=mm_prefix_range, + return_softmax_lse=False, + out=out, + ) + else: + config_3d = _get_unified_attention_3d_config( + cache_block_size=block_size, + head_size=head_size, + num_queries_per_kv=num_queries_per_kv, + sliding_window=sliding_window_val, + use_alibi_slopes=use_alibi_slopes, + use_qq_bias=use_qq_bias, + use_softcap=(softcap > 0), + use_sinks=(sinks is not None), + use_mm_prefix=use_mm_prefix, + num_segments_per_seq=num_par_softmax_segments, + kv_dtype=k.dtype, + ) + if not config_3d: + config_3d = { + "BLOCK_M": BLOCK_M, + "TILE_SIZE": TILE_SIZE_DECODE, + "num_warps": 4, + "num_stages": 2, + } + + BLOCK_Q = config_3d["BLOCK_M"] // num_queries_per_kv + total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + launch_config_3d = dict(config_3d) + launch_config_3d["BLOCK_Q"] = BLOCK_Q + + kernel_unified_attention_3d[ + (total_num_q_blocks, num_kv_heads, num_par_softmax_segments) + ]( + segm_output_ptr=softmax_segm_output, + 
segm_max_ptr=softmax_segm_max, + segm_expsum_ptr=softmax_segm_expsum, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + sink_ptr=sinks, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_ALIBI_SQRT=use_alibi_sqrt, + USE_QQ_BIAS=use_qq_bias, + USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), + USE_MM_PREFIX=use_mm_prefix, + MAX_MM_RANGES=max_mm_ranges, + mm_prefix_range_ptr=mm_prefix_range, + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + num_seqs=num_seqs, + NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments, + **launch_config_3d, + ) + + reduce_config = { + "num_warps": 1, + "num_stages": 1, + } + + reduce_launch_config = dict(reduce_config) + reduce_launch_config["TILE_SIZE"] = config_3d["TILE_SIZE"] + reduce_launch_config["BLOCK_Q"] = BLOCK_Q + reduce_segments[(q.shape[0], num_query_heads)]( + output_ptr=out, + segm_output_ptr=softmax_segm_output, + segm_max_ptr=softmax_segm_max, + segm_expsum_ptr=softmax_segm_expsum, + seq_lens_ptr=seqused_k, + num_seqs=num_seqs, + num_query_heads=num_query_heads, + out_scale_inv=1 / output_scale if output_scale is not None else 1.0, + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + 
def is_fp4_avail():
    """Return True when ``get_arch()`` reports an architecture listed as FP4-capable."""
    # One-element tuple (note the trailing comma). ("gfx946") without the comma
    # is just a string, which turns ``in`` into a substring test.
    return get_arch() in ("gfx946",)


def is_fp8_avail():
    """Return True when ``get_arch()`` reports an architecture listed as FP8-capable."""
    # One-element tuple for the same reason as in is_fp4_avail().
    return get_arch() in ("gfx938",)
def switch_to_contiguous_if_needed(x: torch.Tensor) -> torch.Tensor:
    """Return *x* itself when its last dimension has stride 1, else ``x.contiguous()``."""
    last_dim_is_unit_stride = x.stride(-1) == 1
    return x if last_dim_is_unit_stride else x.contiguous()
# Support tensor in [B, Seqlen, H, d] format. Taking tensors in [B*Seqlen, H, d] as inputs
def persistent_lean_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    Mp: torch.Tensor,
    Lp: torch.Tensor,
    Op: torch.Tensor,  # (total_programs, n_ctx_q, d)
    locks: torch.Tensor,
    batch_num_block_n: torch.Tensor,
    total_programs: int,
    BLOCK_M: int,
    BLOCK_N: int,
    causal: bool,
    batch_size: int,
    sm_scale: torch.float16,
):
    """Print the lean-attention work split and step through every program id.

    The split plan comes from ``get_num_splits_and_buffer_sizes``; it is
    printed, and ``la_persistent`` is then called once per program id with the
    tensors, their strides and the plan. Nothing is returned: the output
    buffer allocated here is handed to ``la_persistent`` only.

    Raises:
        AssertionError: if the head dimensions of ``q``/``k``/``v`` differ or
            the head dimension is not one of 16, 32, 64, 128, 256.
    """
    # Shape constraints: one shared head dimension from a fixed set of sizes.
    HEAD_DIM_Q = q.shape[-1]
    HEAD_DIM_K = k.shape[-1]
    HEAD_DIM_V = v.shape[-1]
    assert (
        HEAD_DIM_Q == HEAD_DIM_K == HEAD_DIM_V
    ), "Incompatible Q/K/V Hidden Dimensions"
    assert HEAD_DIM_K in {16, 32, 64, 128, 256}

    num_heads = q.shape[1]
    ctx_q_per_batch = q.shape[0] // batch_size
    ctx_k_total = k.shape[0]  # sum of all ctx_n in a batch

    BLOCK_RATIO = BLOCK_M // BLOCK_N
    print(f"BLOCK_RATIO={BLOCK_RATIO}")

    qk_scale = sm_scale * 1.44269504  # 1.44269504 ~= log2(e)

    split_plan = get_num_splits_and_buffer_sizes(
        causal,
        batch_size,
        ctx_q_per_batch,
        ctx_k_total,
        num_heads,
        num_heads,
        HEAD_DIM_Q,
        BLOCK_M,
        BLOCK_N,
        total_programs,
    )
    (
        num_m_blocks,
        num_n_blocks,
        high_load_wgs,
        max_tiles_per_wg,
        tiles_per_head,
        total_programs,
        num_splits,
        even_split,
    ) = split_plan
    print(
        f"high_load_wgs={high_load_wgs}, max_tiles_per_wg={max_tiles_per_wg}, tiles_per_head={tiles_per_head}"
    )
    print(
        f"total_programs={total_programs}, num_splits={num_splits}, even_split={even_split}"
    )
    print(f"num_m_blocks={num_m_blocks}, num_n_blocks={num_n_blocks}")

    # grid = (total_programs, 1, 1)

    o = torch.empty_like(q, dtype=v.dtype)

    print(
        f"q.stride(0)={q.stride(0)}, q.stride(1)={q.stride(1)}, q.stride(2)={q.stride(2)}"
    )
    print(
        f"k.stride(0)={k.stride(0)}, k.stride(1)={k.stride(1)}, k.stride(2)={k.stride(2)}"
    )

    # Positional stride arguments, in the order la_persistent expects them.
    stride_args = (
        q.stride(0),  # N_CTX_Q
        q.stride(1),  # H
        q.stride(2),  # Head_Dim
        k.stride(0),
        k.stride(1),
        k.stride(2),
        v.stride(0),
        v.stride(1),
        v.stride(2),
        o.stride(0),
        o.stride(1),
        o.stride(2),
        Op.stride(0),  # total_programs
        Op.stride(1),  # n_ctx_q
        Op.stride(2),  # head_dim
    )
    schedule_kwargs = dict(
        HEAD_DIM=HEAD_DIM_K,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        BLOCK_RATIO=BLOCK_RATIO,
        batch_size=batch_size,
        causal=causal,
        num_m_blocks=num_m_blocks,
        num_n_blocks=num_n_blocks,
        # leanAttention params
        high_load_wgs=high_load_wgs,
        max_tiles_per_wg=max_tiles_per_wg,
        tiles_per_head=tiles_per_head,
        num_splits=num_splits,
    )

    for pid in range(total_programs):
        la_persistent(
            pid,
            q,
            k,
            v,
            qk_scale,
            Mp,
            Lp,
            Op,
            o,
            batch_num_block_n,
            locks,
            *stride_args,
            **schedule_kwargs,
        )
tiles_per_head = 0 + if causal: + # Prefill - Causal + for i in range(0, num_m_blocks): + tiles_per_head += (((i + 1) * BLOCK_M) + BLOCK_N - 1) // BLOCK_N + print(f"tiles_per_head={tiles_per_head}") + # Does not support ragged batch for causal. + tiles_per_head = tiles_per_head * batch_size + print(f"batch_size={batch_size}, tiles_per_head={tiles_per_head}") + else: + # Decode or Not Causal + tiles_per_head = num_m_blocks * num_n_blocks + + total_tiles = tiles_per_head * num_heads_k # Total tiles across all heads + print(f"total_tiles={total_tiles}") + # StreamK Lean has as many threadblocks as SMs + # This should be a function of tile size and number of scratchpad space + # LeanAttention assign 3 CTAs per SM (bounded by LDS size) + lean_griddimz = num_SMs # CTA launch grid + + # if (total_tiles <= 2 * 2 * num_SMs): + # lean_griddimz = min((total_tiles + 1) / 2, (32 * total_tiles + num_n_blocks - 1) / num_n_blocks) + # else: + # lean_griddimz = min(2 * num_SMs, 32 * num_heads_k * batch_size * num_m_blocks) + + # Max number lean tiles per task block (CTA) + # print(f"total_tiles={total_tiles}") + max_tiles_per_tb = (total_tiles + lean_griddimz - 1) // lean_griddimz + # print(f"lean_griddimz={lean_griddimz}, max_tiles_per_tb={max_tiles_per_tb}") + + # Find max number of splits + num_splits = 0 + even_split = False + if total_tiles % lean_griddimz == 0: + even_split = True + num_splits = 1 + ((num_n_blocks + max_tiles_per_tb - 2) // (max_tiles_per_tb)) + else: + even_split = False + num_splits = 1 + ( + (num_n_blocks + max_tiles_per_tb - 3) // (max_tiles_per_tb - 1) + ) + + # high_load_tbs is the remainder of total_tile / num_cta + high_load_tbs = total_tiles - ((max_tiles_per_tb - 1) * lean_griddimz) + + # Needed for causal. 
def find_group(x, BLOCK_RATIO):
    """Locate the group that contains lean-tile index *x*.

    Group ``g`` (0-based) holds ``(g + 1) * BLOCK_RATIO`` tiles, so the groups
    grow linearly. Groups are consumed from the front for as long as the next
    one still ends at or before *x*.

    Returns:
        ``(group_id, group_size, total_blocks)``: the index of the group
        reached, that group's size, and the number of tiles in all the groups
        before it.
    """
    group_id = 0
    total_blocks = 0
    # Extend the running total by one whole group while it stays within x.
    while (grown := total_blocks + (group_id + 1) * BLOCK_RATIO) <= x:
        total_blocks = grown
        group_id += 1
    # NOTE(review): the flattened source does not show whether this print sat
    # inside the loop; it is emitted once, after the loop -- confirm.
    print(f"find_group(): x={x}, group_id={group_id}, total_blocks={total_blocks}")
    return group_id, (group_id + 1) * BLOCK_RATIO, total_blocks
the form of global tile id + if causal: + tile_batch_idx = (iter % tiles_per_head) // (tiles_per_head // batch_size) + # Does not support ragged batching. All requests in the batch have the same context length (per_head_tile_size) + # tiles_per_head: total sum of # BLOCK_N in K/V sequence of all batches + # per_head_tile_size: per head # BLOCK_N of each output tile + per_head_tile_idx, per_head_tile_size, total_blocks = find_group( + iter + - (tile_head_idx * tiles_per_head) + - (tile_batch_idx * (tiles_per_head // batch_size)), + BLOCK_RATIO, + ) + tile_iter = ( + tile_head_idx * tiles_per_head + + (tile_batch_idx * (tiles_per_head // batch_size)) + + total_blocks + ) + tile_iter_end = tile_iter + (per_head_tile_size) + tile_idx = ( + tile_head_idx * batch_size + tile_batch_idx + ) * num_m_blocks + per_head_tile_idx + print(f" causal: per_head_tile_idx={per_head_tile_idx}") + print(f" causal: per_head_tile_size={per_head_tile_size},") + print(f" causal: total_blocks={total_blocks}") + print(f" causal: tile_batch_idx={tile_batch_idx}") + else: + tile_idx = ( + tile_head_idx * batch_size + ) # Output tile idx, 1 output tile per head per batch + tile_iter = tile_head_idx * tiles_per_head + if batch_size == 1: + req_size = tiles_per_head + else: + # req_size = tl.load(batch_num_block_n) + req_size = batch_num_block_n[0] + tile_iter_end = tile_iter + req_size + for b in range(1, batch_size): + # next_req_size = tl.load(batch_num_block_n + b) + next_req_size = batch_num_block_n[b] + local_head_iter = iter % tiles_per_head + if (local_head_iter < next_req_size) and (local_head_iter >= req_size): + tile_iter = tile_iter + req_size + tile_idx = tile_idx + b + tile_iter_end = tile_iter + (next_req_size - req_size) + req_size = next_req_size + print( + f" tile_idx={tile_idx}, tile_iter={tile_iter}, tile_iter_end={tile_iter_end}" + ) + # Local lean tile ID within a loop of an output tile + local_iter = iter - tile_iter + # local_iter_end = tl.minimum(tile_iter_end, 
cta_end_tile_gid) - tile_iter + local_iter_end = min(tile_iter_end, cta_end_tile_gid) - tile_iter + print(f" local_iter={local_iter}, local_iter_end={local_iter_end}") + + if iter == tile_iter: + host_block = True + else: + host_block = False + # finishing_block: the output tile is finished within this block + if cta_end_tile_gid >= tile_iter_end: + finishing_block = True + else: + finishing_block = False + print(f" host_block={host_block}, finishing_block={finishing_block}") + offs_m = torch.arange(0, BLOCK_M) + offs_n = torch.arange(0, BLOCK_N) + offs_k = torch.arange(0, HEAD_DIM) + + if causal: + b_seq_size = tile_batch_idx * num_n_blocks + else: + tile_batch_idx = tile_idx % batch_size + b_seq_size = 0 + if tile_batch_idx > 0: + b_seq_size = 1 + # b_seq_size = tl.load( + # batch_num_block_n + tile_batch_idx - 1 + # ) # Previous batch size + + k_offs = ( + (b_seq_size + local_iter) * BLOCK_N * stride_kn + + tile_head_idx * stride_kh + + offs_n[None, :] * stride_kn + + offs_k[:, None] * stride_kk + ) + v_offs = ( + (b_seq_size + local_iter) * BLOCK_N * stride_vn + + tile_head_idx * stride_vh + + offs_n[:, None] * stride_vn + + offs_k[None, :] * stride_vk + ) + print( + f" b_seq_size={b_seq_size}, k_offs.shape={k_offs.shape}, k_offs={k_offs}" + ) + print( + f" b_seq_size={b_seq_size}, v_offs.shape={v_offs.shape}, v_offs={v_offs}" + ) + # k_ptrs = K + k_offs + # k_ptrs = tl.multiple_of(k_ptrs,(16,1)) + # v_ptrs = V + v_offs + # v_ptrs = tl.multiple_of(v_ptrs,(1,16)) + + if causal: + q_idx = per_head_tile_idx + tile_batch_idx * num_m_blocks + else: + q_idx = tile_batch_idx + q_offs = ( + q_idx * BLOCK_M * stride_qm + + tile_head_idx * stride_qh + + offs_m[:, None] * stride_qm + + offs_k[None, :] * stride_qk + ) + print(f" q_idx={q_idx}, q_offs.shape={q_offs.shape}, q_offs={q_offs}") + o_h_offs = ( + q_idx * BLOCK_M * stride_om + + tile_head_idx * stride_oh + + offs_m[:, None] * stride_om + + offs_k[None, :] * stride_on + ) + # print(f" q_idx={q_idx}, 
o_offs.shape={o_h_offs.shape}, o_offs={o_h_offs}") + # q_ptrs = Q + q_offs + # q_ptrs = tl.multiple_of(q_ptrs,(1,16)) + + # m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + # l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0 + # acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32) + + # q = tl.load(q_ptrs) + offs_m = torch.arange(BLOCK_M) + # OFFSM = q_idx * BLOCK_M + offs_m + offs_n = torch.arange(BLOCK_N) + for l_iter in range(local_iter, local_iter_end): + """ + if causal: + if (tile_iter_end - tile_iter) - l_iter <= BLOCK_RATIO: + OFFSN = (l_iter + tile_batch_idx * num_n_blocks) * BLOCK_N + offs_n + #mask = offs_m[:, None] >= offs_n[None, :] + mask = OFFSM[:, None] >= OFFSN[None, :] + #torch.set_printoptions(threshold=10000) + """ + if causal and (BLOCK_RATIO > 1): + if l_iter == (tile_iter_end - tile_iter) - 2: + mask = offs_m[:, None] >= offs_n[None, :] + torch.set_printoptions(threshold=10_000) + print(f" Inner loop: l_iter={l_iter}, mask.shape={mask.shape}") + torch.set_printoptions(threshold=10_000) + print(f" mask = {mask}") + # qk = tl.where(mask, qk, float("-inf")) + if l_iter == (tile_iter_end - tile_iter) - 1: + mask = (offs_m[:, None] >= BLOCK_N) & ( + offs_n[None, :] <= (offs_m[:, None] - BLOCK_N) + ) + # mask = offs_m[:, None] >= offs_n[None, :] + # qk = tl.where(mask, qk, float("-inf")) + torch.set_printoptions(threshold=10_000) + print(f" Inner loop: l_iter={l_iter}, mask.shape={mask.shape}") + torch.set_printoptions(threshold=10_000) + print(f" mask = {mask}") + if causal and (BLOCK_RATIO == 1): + # if (l_iter == (tile_iter_end - tile_iter) - 1): + if (iter + (l_iter - local_iter)) == (tile_iter_end - 1): + mask = offs_m[:, None] >= offs_n[None, :] + # qk = tl.where(mask, qk, float("-inf")) + + # if (l_iter == (tile_iter_end - tile_iter) - 1) and causal: + # mask = (offs_m[:, None] >= BLOCK_N) & (offs_n[None, :] <= (offs_m[:, None]-BLOCK_N)) + # print(f" Inner loop: l_iter={l_iter}, mask = {mask}") + + # print(f" Inner Loop: 
l_iter={l_iter}") + print(f" Inner loop: {local_iter} to {local_iter_end}") + + # lean output tile epilogue + if not host_block: + # Update pointers of partial results Mp[cta], Lp[cta], Op[cta] + mp_ptrs = Mp + current_pid * BLOCK_M + offs_m + lp_ptrs = Lp + current_pid * BLOCK_M + offs_m + op_ptrs = ( + Op + + current_pid * stride_oph # stride_oph is total_program dimension + + offs_m[:, None] * stride_opm + + offs_k[None, :] * stride_opn + ) + print(" Non host block write partial result") + print(f"mp_ptrs.shape={mp_ptrs.shape}") + print(f"mp_ptrs={mp_ptrs}") + print(f"op_ptrs={op_ptrs}") + # print(f"Mp.shape={Mp.shape}, Mp={Mp}") + + # tl.store(mp_ptrs, m_i, cache_modifier=".wt") + # tl.store(lp_ptrs, l_i, cache_modifier=".wt") + # tl.store(op_ptrs, acc, cache_modifier=".wt") + # tl.debug_barrier() + # tl.store(locks + current_pid, 1, cache_modifier=".wt") + # According to streamK gemm, store + cache_modifier won't work universally + # atomic_xchg is better solution but a less performant variant + # tl.atomic_xchg(locks + current_pid, 1) + + else: # host block + # A host block that is also a finishing block completes all the LeanTile iterations for its output tile + # in a single CTA and so can directly store its results from LeanTile() in global memory without any reduction + + o_h_offs = ( + q_idx * BLOCK_M * stride_om + + tile_head_idx * stride_oh + + offs_m[:, None] * stride_om + + offs_k[None, :] * stride_on + ) + print(f"o_h_offs={o_h_offs}") + # o_ptrs = Out + o_h_offs + if not finishing_block: + # if host not finishing_block: # another CTA is processing the end of the output tile and store partial results + """ + if causal: + q_idx = per_head_tile_idx + tile_batch_idx * num_m_blocks + else: + q_idx = tile_batch_idx + + o_h_offs = ( + q_idx * BLOCK_M * stride_om + + tile_head_idx * stride_oh + + offs_m[:, None] * stride_om + + offs_k[None, :] * stride_on + ) + o_ptrs = Out + o_h_offs + """ + + last_cta = current_pid + 1 + temp_end_gid = cta_end_tile_gid + 
def main():
    """Run the lean-attention schedule walk-through on a small fixed problem.

    Builds random fp16 ``q``/``k``/``v`` tensors and the partial-result buffers
    for one causal request (512 query tokens, 512 context tokens, one head of
    dimension 128, four programs) and passes them to
    ``persistent_lean_attention``.

    Raises:
        ValueError: if ``n_ctx`` holds a value that cannot be converted to int.
    """
    batch = 1
    causal = True
    h = 1
    n_ctx_q = 512
    n_ctx = [512]
    d = 128
    total_programs = 4

    init_dtype = torch.float16
    BLOCK_M = 128
    BLOCK_N = 64
    assert batch == len(n_ctx)

    try:
        sum_n_ctx = sum(int(n) for n in n_ctx)
    except ValueError:
        print(f"N_CTX contains non-numeric values: {n_ctx}")
        # Re-raise: carrying on would only fail later on an unbound sum_n_ctx.
        raise

    print(f"causal={causal}, batch={batch}")
    # n_ctx is a list of context lengths, one per request in the batch.
    # First, compute the number of BLOCK_N blocks per context length
    # ("list_num_block_n"). Second, turn it into cumulative counts
    # ("list_sum_block_n"). Third, convert that list to a tensor
    # ("batch_num_block_n").
    list_num_block_n = [
        (int(str(s).strip()) + BLOCK_N - 1) // BLOCK_N for s in n_ctx
    ]
    len_sum = 0
    list_sum_block_n = []
    for num_block_n in list_num_block_n:
        len_sum += num_block_n
        list_sum_block_n.append(len_sum)
    batch_num_block_n = torch.tensor(list_sum_block_n, dtype=torch.int32)

    sm_scale = 0.5

    # Allocate Tensors
    q = torch.empty((n_ctx_q * batch, h, d), dtype=init_dtype).normal_(
        mean=0.0, std=0.5
    )
    k = torch.empty((sum_n_ctx, h, d), dtype=init_dtype).normal_(mean=0.0, std=0.5)
    v = torch.empty((sum_n_ctx, h, d), dtype=init_dtype).normal_(mean=0.0, std=0.5)

    # LeanAttention Specific Parameters
    # The partial-result buffers hold one BLOCK_M-row tile per program (the
    # earlier variant sized them by n_ctx_q instead of BLOCK_M).
    Mp = torch.empty((total_programs, BLOCK_M), device=q.device, dtype=torch.float32)
    Lp = torch.empty((total_programs, BLOCK_M), device=q.device, dtype=torch.float32)
    Op = torch.empty((total_programs, BLOCK_M, d), device=q.device, dtype=torch.float32)

    locks = torch.zeros((total_programs,), device=q.device, dtype=torch.int32)

    # Triton LeanAttention output
    persistent_lean_attention(
        q,
        k,
        v,
        Mp,
        Lp,
        Op,
        locks,
        batch_num_block_n,
        total_programs,
        BLOCK_M,
        BLOCK_N,
        causal,
        batch,
        sm_scale,
    )
triton.language as tl +import aiter.ops.triton.utils.arch_info as arch_info + + +@triton.jit +def _compute_fp8_scaling_factors(x, fp8_max: tl.constexpr): + # compute fp8 scaling and descaling factor for a block + x_amax = tl.max(tl.abs(x)) # NOTE: abs deals with negative values + x_amax = tl.where(x_amax <= 1e-9, 1e-9, x_amax) + scale_x = fp8_max / x_amax + descale_x = x_amax / fp8_max + return scale_x, descale_x + + +def _is_fp8(x): + if x.dtype in { + torch.float8_e4m3fnuz, + torch.float8_e4m3fn, + torch.float8_e5m2, + torch.float8_e5m2fnuz, + }: + if arch_info.is_fp8_avail(): + return True + else: + raise RuntimeError("This device does not support fp8") + else: + return False diff --git a/aiter/ops/triton/utils/moe_common.py b/aiter/ops/triton/utils/moe_common.py new file mode 100644 index 0000000000000000000000000000000000000000..ed35bdd06a75be888873edda8af5214049e56d35 --- /dev/null +++ b/aiter/ops/triton/utils/moe_common.py @@ -0,0 +1,22 @@ +import triton +import triton.language as tl + + +@triton.jit +def _write_zeros_to_output( + c_ptr, + stride_cm, + stride_cn, + pid_n, + N, + offs_token, + token_mask, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + compute_type, +): + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) diff --git a/aiter/ops/triton/utils/moe_config_utils.py b/aiter/ops/triton/utils/moe_config_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..64db129890be880f5edf958069b2828f04fd951d --- /dev/null +++ b/aiter/ops/triton/utils/moe_config_utils.py @@ -0,0 +1,276 @@ +# SPDX-License-Identifier: MIT + +import torch +from typing import Any, Dict, Optional, List +import os +import json +import functools +import aiter.ops.triton.utils.arch_info as arch_info +from 
def get_config_dtype_str(
        dtype: torch.dtype,
        use_int4_w4a16: Optional[bool] = False,
        use_int8_w8a16: Optional[bool] = False,
        use_int8_w8a8: Optional[bool] = False,
        use_fp8_w8a8: Optional[bool] = False,
        use_int4_w4a8: Optional[bool] = False,
        use_mxfp4_w4a4: Optional[bool] = False) -> Optional[str]:
    """Return the dtype tag used to select a tuned MoE config, or None.

    The quantization flags are checked in a fixed precedence order and the
    first truthy one decides the tag. With no flag set, ``torch.float`` maps
    to ``"float32"`` and every other ``dtype`` yields None.
    """
    # Listed in precedence order: the first enabled flag wins.
    tagged_flags = (
        (use_fp8_w8a8, "fp8_w8a8"),
        (use_int8_w8a8, "int8_w8a8"),
        (use_int8_w8a16, "int8_w8a16"),
        (use_int4_w4a16, "int4_w4a16"),
        (use_int4_w4a8, "int4_w4a8"),
        (use_mxfp4_w4a4, "mxfp4_w4a4"),
    )
    for enabled, tag in tagged_flags:
        if enabled:
            return tag
    if dtype == torch.float:
        # float32 MoE gets its own tag so it does not pick up the
        # fp16/bfloat16 configs, with which the kernel can fail.
        return "float32"
    return None
def closest_power_of_two(a, dtype):
    """Snap ``a`` to the nearest power of two inside the supported range.

    The range is [2, 16] for ``dtype == "int8_w8a8"`` and [2, 32] otherwise.
    Values at or outside a bound are clamped to it. When ``a`` is exactly
    halfway between two powers of two, the smaller one is returned.
    """
    smallest = 2
    largest = 16 if dtype == "int8_w8a8" else 32
    if a <= smallest:
        return smallest
    if a >= largest:
        return largest

    # Smallest power of two that is not below ``a``.
    above = 1
    while above < a:
        above *= 2
    below = above // 2
    return below if (a - below) <= (above - a) else above
configurations for the fused MoE kernel. + + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs + # directory + block_shape = [block_n, block_k] if block_n and block_k else None + json_file_name = get_config_file_name(E, N, dtype, block_shape, is_bottom) + + config_file_path = os.path.join( + f"{AITER_TRITON_CONFIGS_PATH}", "moe", json_file_name) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info("Using configuration from %s for MoE layer.", + config_file_path) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + elif is_bottom: + # if config with is_bottom json file not found, try to fallback use config without bottom json. 
def try_get_optimal_moe_config(
    w_shape: tuple[int, ...],
    top_k: int,
    dtype: Optional[str],
    M: int,
    is_marlin: bool = False,
    block_shape: Optional[list[int]] = None,
    is_bottom: bool = False,
):
    """Pick the kernel config for a fused-MoE GEMM with ``M`` input rows.

    A tuned config table is looked up through ``get_moe_configs``. If one
    exists, the entry whose batch-size key is closest to ``M`` is used;
    otherwise ``get_default_config`` supplies the config.

    Args:
        w_shape: Weight shape, unpacked as ``(E, K, N)`` when ``is_bottom`` is
            true (w2 weight) and as ``(E, N, K)`` otherwise (w1 weight).
        top_k: Forwarded to ``get_default_config``.
        dtype: Config dtype tag (e.g. ``"fp8_w8a8"``) or None.
        M: Number of input rows used to select the config.
        is_marlin: Forwarded to ``get_default_config``.
        block_shape: Optional ``[block_n, block_k]`` for block-wise quantization.
        is_bottom: Selects the w2 (True) or w1 (False) weight handling.

    Returns:
        ``config`` when ``is_bottom`` is false, otherwise the tuple
        ``(config, max_block_m)``. ``max_block_m`` is the largest
        ``BLOCK_SIZE_M`` among the chosen config and all tuned configs whose
        key is ``<= M``.
    """
    # is_bottom tells us whether this is the w1 or the w2 weight.
    if is_bottom:
        # w2_weight
        E, K, N = w_shape
        if dtype in ("int4_w4a16", "int4_w4a8"):
            # For int4 the intermediate size N of w2 is packed, so N * 2
            # recovers the intermediate size before packing.
            N = N * 2
    else:
        # w1_weight
        E, N, K = w_shape
        # w1 merges two weights along N, so N // 2 is the real intermediate size.
        N = N // 2

    block_n = block_shape[0] if block_shape else 0
    block_k = block_shape[1] if block_shape else 0
    configs = get_moe_configs(E, N, dtype, block_n, block_k, is_bottom)

    if not configs:
        # No tuned table available: use the default config.
        config = get_default_config(M, E, N, K, top_k, dtype,
                                    is_marlin, block_shape, is_bottom)
        if is_bottom:
            return config, config["BLOCK_SIZE_M"]
        return config

    # A tuned table exists: take the entry whose batch size is closest to M.
    config = configs[min(configs, key=lambda x: abs(x - M))]
    if not is_bottom:
        return config

    # Only configs tuned for batch sizes <= M are considered here (noted
    # upstream as a memory-saving optimization). A table may contain no such
    # key, e.g. when M is smaller than every tuned batch size; ``default``
    # keeps ``max`` from raising ValueError on the empty sequence.
    max_block_m = max(
        (cfg["BLOCK_SIZE_M"] for key, cfg in configs.items() if key <= M),
        default=config["BLOCK_SIZE_M"],
    )
    return config, max(max_block_m, config["BLOCK_SIZE_M"])
@triton.jit
def remap_xcd_chunked(
    pid, GRID_MN, NUM_XCDS: tl.constexpr = 8, CHUNK_SIZE: tl.constexpr = 2
):
    """
    Remaps a 1D pid so that pids sharing the same ``pid % NUM_XCDS`` end up in
    runs of CHUNK_SIZE consecutive pids.

    Within every group of NUM_XCDS * CHUNK_SIZE pids the group index is kept;
    inside the group the new order is by ``pid % NUM_XCDS`` first and by
    position within the chunk second.

    Args:
    - pid: 1D pid
    - GRID_MN: grid size used to find the last full group
      (presumably the total number of pids - TODO confirm with callers)
    - NUM_XCDS: tl.constexpr: default is 8
    - CHUNK_SIZE: tl.constexpr: default is 2

    Returns the remapped pid, or ``pid`` unchanged when it lies past the last
    full group of NUM_XCDS * CHUNK_SIZE pids.
    """
    # Compute current XCD and local PID
    # NOTE(review): assumes the hardware hands out pids to XCDs round-robin,
    # so pid % NUM_XCDS identifies the XCD - confirm for the target device.
    xcd = pid % NUM_XCDS
    # distribute the modulo pids in round robin
    # The bound is GRID_MN rounded down to a multiple of NUM_XCDS * CHUNK_SIZE.
    # The comparison is strict, so pid == bound still takes the remap path
    # below; for that pid the formula evaluates to the bound itself.
    if pid > (GRID_MN // (NUM_XCDS * CHUNK_SIZE)) * (NUM_XCDS * CHUNK_SIZE):
        return pid
    local_pid = pid // NUM_XCDS
    # Calculate chunk index and position within chunk
    chunk_idx = local_pid // CHUNK_SIZE
    pos_in_chunk = local_pid % CHUNK_SIZE
    # Calculate new PID
    new_pid = chunk_idx * NUM_XCDS * CHUNK_SIZE + xcd * CHUNK_SIZE + pos_in_chunk
    return new_pid
def get_dtype_max(dtype):
    """Return the largest finite value representable by a torch dtype.

    Args:
        dtype: A ``torch.dtype``.

    Returns:
        ``torch.finfo(dtype).max`` for floating-point dtypes and
        ``torch.iinfo(dtype).max`` for all others.

    Raises:
        TypeError: Propagated from ``torch.iinfo`` for non-floating dtypes it
            does not support.
    """
    # Ask the dtype directly instead of allocating a throwaway tensor just to
    # call torch.is_floating_point on it.
    if dtype.is_floating_point:
        return torch.finfo(dtype).max
    return torch.iinfo(dtype).max
# page attention ops
def paged_attention_v1(
    out: torch.Tensor,
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    num_kv_heads: int,
    scale: float,
    block_tables: torch.Tensor,
    seq_lens: torch.Tensor,
    block_size: int,
    max_seq_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
    k_scale: float,
    v_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
    blocksparse_block_size: int = 64,
    blocksparse_head_sliding_step: int = 0,
) -> None:
    """Forward every argument, positionally and in order, to ``ops.paged_attention_v1``.

    Nothing is returned by this wrapper.
    NOTE(review): the result is presumably written into ``out`` by the
    underlying op - confirm against the op's implementation.
    """
    # Arguments are passed positionally, so their order must stay in sync
    # with the signature of ops.paged_attention_v1.
    ops.paged_attention_v1(
        out,
        query,
        key_cache,
        value_cache,
        num_kv_heads,
        scale,
        block_tables,
        seq_lens,
        block_size,
        max_seq_len,
        alibi_slopes,
        kv_cache_dtype,
        k_scale,
        v_scale,
        tp_rank,
        blocksparse_local_blocks,
        blocksparse_vert_stride,
        blocksparse_block_size,
        blocksparse_head_sliding_step,
    )
def _use_rocm_custom_paged_attention(
    qtype: torch.dtype,
    head_size: int,
    block_size: int,
    gqa_ratio: int,
    max_seq_len: int,
) -> bool:
    """Decide whether the ROCm custom paged-attention kernel is eligible.

    All of the following must hold: not running on Navi (gfx1*), query dtype
    is half or bf16, head size is 64 or 128, block size is 16 or 32, the GQA
    ratio lies in [1, 16], and ``max_seq_len`` is at most 65536.
    """
    # rocm custom page attention not support on navi (gfx1*)
    if _ON_NAVI:
        return False
    if qtype not in (torch.half, dtypes.bf16):
        return False
    if head_size not in (64, 128):
        return False
    if block_size not in (16, 32):
        return False
    if not (1 <= gqa_ratio <= 16):
        return False
    return max_seq_len <= 65536
@staticmethod
def forward_decode(
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    block_tables: torch.Tensor,
    seq_lens: torch.Tensor,
    max_seq_len: int,
    kv_cache_dtype: str,
    num_kv_heads: int,
    scale: float,
    alibi_slopes: Optional[torch.Tensor],
    k_scale: float,
    v_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
    blocksparse_block_size: int = 64,
    blocksparse_head_sliding_step: int = 0,
    fp8_out_scale=None,
) -> torch.Tensor:
    """Run decode-time paged attention, choosing one of three kernels.

    ``query`` must be 3-D; it is unpacked as
    ``(num_seqs, num_heads, head_size)``. ``block_size`` is read from
    ``value_cache.shape[3]``.

    Kernel selection:
      * If ``_use_rocm_custom_paged_attention`` accepts the problem,
        ``ops.paged_attention_rocm`` is called with partition size
        ``_PARTITION_SIZE_ROCM``.
      * Otherwise ``ops.paged_attention_v1`` or ``ops.paged_attention_v2`` is
        called, chosen by the ``use_v1`` heuristic below.

    ``fp8_out_scale`` is only used on the ROCm custom path. ``tp_rank`` and
    the ``blocksparse_*`` arguments are only passed to the v1/v2 kernels.

    Returns:
        A tensor shaped like ``query``. The one exception is the ROCm custom
        path with ``fp8_out_scale`` set, which returns a ``dtypes.fp8``
        tensor viewed as ``(num_seqs, num_heads * head_size)``.
    """
    # Whether to use rocm custom paged attention or not
    num_seqs, num_heads, head_size = query.shape
    block_size = value_cache.shape[3]
    gqa_ratio = num_heads // num_kv_heads
    use_custom = _use_rocm_custom_paged_attention(
        query.dtype, head_size, block_size, gqa_ratio, max_seq_len
    )
    output = torch.empty_like(query)
    if use_custom:
        # Ceil-divide the longest sequence into fixed-size partitions.
        max_num_partitions = (
            max_seq_len + _PARTITION_SIZE_ROCM - 1
        ) // _PARTITION_SIZE_ROCM
        assert _PARTITION_SIZE_ROCM % block_size == 0
        # Per-partition scratch buffers handed to the kernel.
        tmp_output = torch.empty(
            size=(num_seqs, num_heads, max_num_partitions, head_size),
            dtype=output.dtype,
            device=output.device,
        )
        exp_sums = torch.empty(
            size=(num_seqs, num_heads, max_num_partitions),
            dtype=dtypes.fp32,
            device=output.device,
        )
        max_logits = torch.empty_like(exp_sums)
        cpa_fp8_out = False
        if fp8_out_scale is not None:
            # fp8 output requested: replace the output buffer with an fp8 one.
            output = torch.empty_like(output, dtype=dtypes.fp8)
            cpa_fp8_out = True
        ops.paged_attention_rocm(
            output,
            exp_sums,
            max_logits,
            tmp_output,
            query,
            key_cache,
            value_cache,
            num_kv_heads,
            scale,
            block_tables,
            seq_lens,
            block_size,
            max_seq_len,
            alibi_slopes,
            kv_cache_dtype,
            k_scale,
            v_scale,
            fp8_out_scale if cpa_fp8_out else None,
            _PARTITION_SIZE_ROCM,
        )
        if cpa_fp8_out:
            # Only the fp8 result is flattened to 2-D; the non-fp8 result
            # falls through to the common ``return output`` below.
            return output.view(num_seqs, num_heads * head_size)
    else:
        max_num_partitions = (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
        if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1:
            # use blocksparse paged attention
            # NOTE(review): block_size is re-read here from the last axis of
            # value_cache rather than shape[3] - the same axis only if
            # value_cache is 4-D; confirm.
            block_size = value_cache.size(-1)
            assert (
                blocksparse_block_size > 0
                and blocksparse_block_size % block_size == 0
            ), (
                f"{blocksparse_block_size=} needs to be a multiple of"
                f"{block_size=} used in block_tables."
            )

        # NOTE(woosuk): We use a simple heuristic to decide whether to use
        # PagedAttention V1 or V2. If the number of partitions is 1, we use
        # V1 to avoid the overhead of reduction. Also, if the number of
        # sequences or heads is large, we use V1 since there is enough work
        # to parallelize.
        # TODO(woosuk): Tune this heuristic.
        # For context len > 8192, use V2 kernel to avoid shared memory shortage.
        use_v1 = max_seq_len <= 8192 and (
            max_num_partitions == 1 or num_seqs * num_heads > 512
        )

        if use_v1:
            # Run PagedAttention V1.
            ops.paged_attention_v1(
                output,
                query,
                key_cache,
                value_cache,
                num_kv_heads,
                scale,
                block_tables,
                seq_lens,
                block_size,
                max_seq_len,
                alibi_slopes,
                kv_cache_dtype,
                k_scale,
                v_scale,
                tp_rank,
                blocksparse_local_blocks,
                blocksparse_vert_stride,
                blocksparse_block_size,
                blocksparse_head_sliding_step,
            )
        else:
            # Run PagedAttention V2.
            assert _PARTITION_SIZE % block_size == 0
            tmp_output = torch.empty(
                size=(num_seqs, num_heads, max_num_partitions, head_size),
                dtype=output.dtype,
                device=output.device,
            )
            exp_sums = torch.empty(
                size=(num_seqs, num_heads, max_num_partitions),
                dtype=dtypes.fp32,
                device=output.device,
            )
            max_logits = torch.empty_like(exp_sums)
            ops.paged_attention_v2(
                output,
                exp_sums,
                max_logits,
                tmp_output,
                query,
                key_cache,
                value_cache,
                num_kv_heads,
                scale,
                block_tables,
                seq_lens,
                block_size,
                max_seq_len,
                alibi_slopes,
                kv_cache_dtype,
                k_scale,
                v_scale,
                tp_rank,
                blocksparse_local_blocks,
                blocksparse_vert_stride,
                blocksparse_block_size,
                blocksparse_head_sliding_step,
            )
    return output
@staticmethod
def copy_blocks(
    kv_caches: List[torch.Tensor],
    src_to_dists: torch.Tensor,
) -> None:
    """Split each cache into its key and value halves and call ``ops.copy_blocks``.

    Index 0 of every entry in ``kv_caches`` is collected into the key list and
    index 1 into the value list; both lists are passed on together with
    ``src_to_dists``.
    """
    key_caches = []
    value_caches = []
    for cache in kv_caches:
        key_caches.append(cache[0])
        value_caches.append(cache[1])
    ops.copy_blocks(key_caches, value_caches, src_to_dists)
+"""Rotary Positional Embeddings.""" + +import math +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from dataclasses import dataclass +from aiter import ( + dtypes, + fused_qk_norm_mrope_3d_cache_pts_quant_shuffle, + fused_qk_norm_rope_cache_pts_quant_shuffle, +) + +# from custom_op import CustomOp + +import os + +AITER_ROPE_TRITON_BACKEND = int(os.environ.get("AITER_ROPE_TRITON_BACKEND", 0)) == 1 +AITER_ROPE_NATIVE_BACKEND = int(os.environ.get("AITER_ROPE_NATIVE_BACKEND", 0)) == 1 +AITER_ROPE_FUSED_QKNORM = int(os.environ.get("AITER_ROPE_FUSED_QKNORM", 0)) == 1 + + +def _rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def _apply_rotary_emb( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> torch.Tensor: + """ + Args: + x: [num_tokens, num_heads, head_size] + cos: [num_tokens, head_size // 2] + sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. 
+ """ + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) + else: + x1 = x[..., ::2] + x2 = x[..., 1::2] + o1 = x1 * cos - x2 * sin + o2 = x2 * cos + x1 * sin + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) + + +# class RotaryEmbedding(CustomOp): +class RotaryEmbedding(nn.Module): + """Original rotary positional embedding.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__() + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.is_neox_style = is_neox_style + self.dtype = dtype + + cos, sin = self._compute_cos_sin_cache() + cos = cos.to(dtype) + sin = sin.to(dtype) + self.cos_cache: torch.Tensor + self.sin_cache: torch.Tensor + self.register_buffer("cos_cache", cos, persistent=False) + self.register_buffer("sin_cache", sin, persistent=False) + + def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + """Compute the inverse frequency.""" + # NOTE(woosuk): To exactly match the HF implementation, we need to + # use CPU to compute the cache and then move it to GPU. However, we + # create the cache on GPU for faster initialization. This may cause + # a slight numerical difference between the HF implementation and ours. 
def _compute_cos_sin_cache(self) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute the cos and sin cache.

    Returns:
        A ``(cos, sin)`` pair. Each tensor has shape
        ``[max_position_embeddings, 1, 1, F]``, where ``F`` is the length of
        the vector returned by ``self._compute_inv_freq(self.base)``.
    """
    inv_freq = self._compute_inv_freq(self.base)
    # One row per position index 0 .. max_position_embeddings - 1.
    t = torch.arange(self.max_position_embeddings, dtype=dtypes.fp32)

    # Outer product: freqs[i, j] = t[i] * inv_freq[j].
    freqs = torch.einsum("i,j -> ij", t, inv_freq)
    # Two singleton axes are inserted in front of the last axis.
    cos = freqs.cos().unsqueeze(-2).unsqueeze(-2)
    sin = freqs.sin().unsqueeze(-2).unsqueeze(-2)
    return cos, sin
key is None: + return query + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = ( + key[..., : self.rotary_dim] + if not is_nope_first + else key[..., -self.rotary_dim :] + ) + key_pass = ( + key[..., self.rotary_dim :] + if not is_nope_first + else key[..., : -self.rotary_dim] + ) + key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) + key = ( + torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + if not is_nope_first + else torch.cat((key_pass, key_rot), dim=-1).reshape(key_shape) + ) + return query, key + + # def forward_cuda( + def forward_old( + self, + positions: torch.Tensor, + # if is_nope_first + # [num_tokens, num_heads, nope_size+rope_size] + # if NOT is_nope_first + # [num_tokens, num_heads, rope_size+nope_size], + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + is_nope_first=False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # from vllm import _custom_ops as ops + import aiter as ops + + self.cos_cache = self.cos_cache.to(query.device, dtype=query.dtype) + self.sin_cache = self.sin_cache.to(query.device, dtype=query.dtype) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. 
def forward_hip(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: Optional[torch.Tensor] = None,
    offsets: Optional[torch.Tensor] = None,
    is_nope_first=False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply the rotary embedding through the aiter ``*_inplace`` kernels.

    ``query`` (and ``key`` when given) is viewed as
    ``(1, num_tokens, -1, head_size)`` and only the rope slice of each head
    is handed to the kernel: the first ``rotary_dim`` features, or the last
    ``rotary_dim`` when ``is_nope_first`` is true.

    Args:
        positions: Token positions; reshaped to ``(1, num_tokens)``.
        query: Query tensor; its dtype must equal ``self.cos_cache.dtype``.
        key: Optional key tensor.
        offsets: Optional position offsets, reshaped like ``positions``.
        is_nope_first: Selects which end of each head holds the rope slice.

    Returns:
        ``query`` viewed back to its input shape when ``key`` is ``None``,
        otherwise ``(query, key)``, both viewed back to their input shapes.

    Raises:
        AssertionError: If the cache dtype differs from ``query.dtype``.
    """
    import aiter as ops

    assert (
        self.cos_cache.dtype == query.dtype
    ), f"cos_cache dtype ({self.cos_cache.dtype}) does not match query dtype ({query.dtype})"

    cos, sin = self.cos_cache, self.sin_cache
    rotate_style = 0 if self.is_neox_style else 1
    num_tokens = positions.numel()

    query_shape = query.shape
    query = query.view(1, num_tokens, -1, self.head_size)
    if key is not None:
        key_shape = key.shape
        key = key.view(1, num_tokens, -1, self.head_size)

    lead_dims = query.shape[:2]
    positions = positions.view(*lead_dims)
    if offsets is not None:
        offsets = offsets.view(*lead_dims)

    # One slice object selects the rope features for both query and key.
    if is_nope_first:
        rope_part = slice(-self.rotary_dim, None)
    else:
        rope_part = slice(None, self.rotary_dim)
    query_ = query[..., rope_part]
    kernel_flags = {"reuse_freqs_front_part": True, "nope_first": is_nope_first}

    if key is None:
        if offsets is None:
            ops.rope_cached_positions_fwd_inplace(
                query_, cos, sin, positions, rotate_style, **kernel_flags
            )
        else:
            ops.rope_cached_positions_offsets_fwd_inplace(
                query_, cos, sin, positions, offsets, rotate_style, **kernel_flags
            )
        return query.view(query_shape)

    key_ = key[..., rope_part]
    if offsets is None:
        ops.rope_cached_positions_2c_fwd_inplace(
            query_, key_, cos, sin, positions, rotate_style, **kernel_flags
        )
    else:
        ops.rope_cached_positions_offsets_2c_fwd_inplace(
            query_, key_, cos, sin, positions, offsets, rotate_style, **kernel_flags
        )
    return query.view(query_shape), key.view(key_shape)
class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling.

    Several scaling factors are supported at once. Different LoRA adapters
    may use different factors, so one cos/sin table is built per factor and
    the tables are stacked along the position axis. A batch that mixes
    adapters can then run through a single rotary kernel call.

    With factors ``x``, ``y`` and ``z`` whose tables have ``m``, ``o`` and
    ``p`` rows, the stacked cache is::

        [x_1 ... x_m, y_1 ... y_o, z_1 ... z_p]

    and callers add the per-factor row offset to their positions. The
    offsets are exposed through ``scaling_factor_to_offset``.

    Credits to the Reddit user /u/kaiokendev
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int,
        is_neox_style: bool,
        scaling_factors: Union[List[float], float],
        dtype: torch.dtype,
    ) -> None:
        # A bare number means one factor. Integers are accepted too (for
        # example ``2`` read from a JSON config); previously only ``float``
        # was wrapped, so an ``int`` failed later when iterated.
        if isinstance(scaling_factors, (int, float)):
            scaling_factors = [scaling_factors]
        # Must be set before the base initialiser runs, because
        # ``_compute_cos_sin_cache`` below reads it.
        self.scaling_factors: List[float] = scaling_factors  # noqa
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )
        # Filled in by ``_compute_cos_sin_cache``.
        self._scaling_factor_to_offset: Dict[float, int]

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Build one ``[cos | sin]`` table per factor and stack them.

        Also records, for every factor, the row at which its table starts.

        NOTE(review): this returns a single stacked tensor; confirm that the
        base-class initialiser (not visible here) accepts that form.
        """
        inv_freq = self._compute_inv_freq(self.base)
        cache_list: List[torch.Tensor] = []
        # offsets[i] is the first row of the table for scaling_factors[i].
        offsets: List[int] = []
        next_offset = 0
        for scaling_factor in self.scaling_factors:
            # NOTE(woosuk): self.max_position_embeddings is the original
            # maximum length before applying the rope scaling.
            # Thus, the maximum length after applying the rope scaling is
            # self.max_position_embeddings * self.scaling_factor.
            max_len = self.max_position_embeddings * scaling_factor
            t = torch.arange(max_len, dtype=dtypes.fp32)
            t = t / scaling_factor

            freqs = torch.einsum("i,j -> ij", t, inv_freq)
            cache = torch.cat((freqs.cos(), freqs.sin()), dim=-1)

            offsets.append(next_offset)
            next_offset += cache.shape[0]
            cache_list.append(cache)

        self._scaling_factor_to_offset = {
            float(scaling_factor): offset
            for scaling_factor, offset in zip(self.scaling_factors, offsets)
        }
        assert len(self.scaling_factors) == len(offsets)
        return torch.cat(cache_list, dim=0)

    @property
    def scaling_factor_to_offset(self) -> Dict[float, int]:
        """Map each scaling factor to the first row of its cache table."""
        return self._scaling_factor_to_offset
# Inverse dim formula to find dim based on number of rotations
def _yarn_find_correction_dim(
    num_rotations: int,
    dim: int,
    base: float = 10000,
    max_position_embeddings: int = 2048,
) -> float:
    """Return the (fractional) dimension index for ``num_rotations``.

    Evaluates ``dim * ln(max_pos / (2 * pi * num_rotations)) / (2 * ln(base))``.
    """
    numerator = dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
    denominator = 2 * math.log(base)
    return numerator / denominator


# Find dim range bounds based on rotations
def _yarn_find_correction_range(
    low_rot: int,
    high_rot: int,
    dim: int,
    base: float = 10000,
    max_position_embeddings: int = 2048,
) -> Tuple[int, int]:
    """Return the ``(low, high)`` dimension bounds, clamped to ``[0, dim - 1]``.

    ``low`` is the floor of the correction dim for ``low_rot`` and ``high``
    is the ceiling of the correction dim for ``high_rot``.
    """
    lower = math.floor(
        _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    upper = math.ceil(
        _yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    # Clamp values just in case
    return max(lower, 0), min(upper, dim - 1)


def _yarn_linear_ramp_mask(
    low: float, high: float, dim: int, dtype: torch.dtype
) -> torch.Tensor:
    """Return a length-``dim`` ramp rising linearly from 0 at ``low`` to 1 at ``high``.

    Values outside ``[low, high]`` are clamped to 0 and 1.
    """
    if low == high:
        # Prevent singularity
        high += 0.001

    width = high - low
    steps = torch.arange(dim, dtype=dtype)
    return torch.clamp((steps - low) / width, 0, 1)


def _yarn_get_mscale(scale: float = 1) -> float:
    """Return ``1.0`` for ``scale <= 1``, else ``0.1 * ln(scale) + 1.0``."""
    return 1.0 if scale <= 1 else 0.1 * math.log(scale) + 1.0
github.com/jquesnelle/yarn + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation + self.mscale = float(_yarn_get_mscale(self.scaling_factor) * attn_factor) + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: + pos_freqs = self.base ** ( + torch.arange(0, self.rotary_dim, 2, dtype=dtypes.fp32) / self.rotary_dim + ) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) + + low, high = _yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.rotary_dim, + self.base, + self.max_position_embeddings, + ) + # Get n-d rotational scaling corrected for extrapolation + inv_freq_mask = ( + 1 + - _yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=dtypes.fp32) + ) * self.extrapolation_factor + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.scaling_factor) + t = torch.arange( + self.max_position_embeddings * self.scaling_factor, dtype=dtypes.fp32 + ) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() * self.mscale + sin = freqs.sin() * self.mscale + cache = torch.cat((cos, sin), dim=-1) + return cache + + +class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): + """Phi3 family of models scaled rotary embedding. 
+ + Based on the original RotaryEmbedding implementation. + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + original_max_position_embeddings: int, + base: int, + is_neox_style: bool, + dtype: torch.dtype, + short_factor: List[float], + long_factor: List[float], + short_mscale: Optional[float] = None, + long_mscale: Optional[float] = None, + ): + super().__init__() + + if rotary_dim != head_size: + raise ValueError( + f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \ + rotary_dim != head_size ({rotary_dim}!={head_size})." + ) + if is_neox_style is False: + raise ValueError( + "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." + ) + + self.head_size = head_size + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.base = base + self.short_factor = short_factor + self.long_factor = long_factor + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt( + 1 + math.log(scale) / math.log(self.original_max_position_embeddings) + ) + if short_mscale is None: + short_mscale = scaling_factor + if long_mscale is None: + long_mscale = scaling_factor + + self.short_mscale = short_mscale + self.long_mscale = long_mscale + + short_cache = self._compute_cos_sin_cache( + original_max_position_embeddings, short_factor, short_mscale + ) + short_cache = short_cache.to(dtype) + self.register_buffer("short_cos_sin_cache", short_cache, persistent=False) + + long_cache = self._compute_cos_sin_cache( + max_position_embeddings, long_factor, long_mscale + ) + long_cache = long_cache.to(dtype) + self.register_buffer("long_cos_sin_cache", long_cache, persistent=False) + + long_short_cache = torch.cat( + [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0 + ) + self.register_buffer( + "long_short_cos_sin_cache", long_short_cache, 
def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
    """Return the YaRN magnitude scale for a context-extension factor.

    Args:
        scale: Context-extension factor.
        mscale: Multiplier applied to the logarithmic term.

    Returns:
        ``1.0`` when ``scale <= 1``; otherwise
        ``0.1 * mscale * ln(scale) + 1.0``.
    """
    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
def _compute_cos_sin_cache(self) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build the YaRN cos/sin tables for ``DeepseekScalingRotaryEmbedding``.

    Positions run over ``max_position_embeddings * scaling_factor`` and the
    tables are created on the ``"cuda"`` device in fp32.

    Returns:
        ``(cos, sin)``, each multiplied by ``self.mscale`` and shaped
        ``(num_positions, 1, 1, len(inv_freq))``.
    """
    inv_freq = self._compute_inv_freq(self.scaling_factor)
    t = torch.arange(
        self.max_position_embeddings * self.scaling_factor,
        device="cuda",
        dtype=dtypes.fp32,
    )
    freqs = torch.einsum("i,j -> ij", t, inv_freq)
    # Scale first, then add the two singleton dims. Previously the scaled
    # tables were computed and then overwritten by unscaled ones, so
    # ``self.mscale`` had no effect whenever it differed from 1.
    cos = (freqs.cos() * self.mscale).unsqueeze(-2).unsqueeze(-2)
    sin = (freqs.sin() * self.mscale).unsqueeze(-2).unsqueeze(-2)
    return cos, sin
def forward(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply (multimodal) rotary embedding with plain PyTorch ops.

    Args:
        positions:
            [num_tokens,] (text only) or
            [3, num_tokens] (T/H/W positions with multimodal inputs)
        query: [num_tokens, num_heads * head_size]
        key: [num_tokens, num_kv_heads * head_size]

    Returns:
        ``(query, key)`` with the first ``rotary_dim`` features of every
        head rotated, each in its input shape.

    NOTE(review): reads ``self.cos_sin_cache``; confirm the base class
    registers a buffer under that name.
    """
    assert positions.ndim in (1, 2)

    num_tokens = positions.shape[-1]
    cos, sin = self.cos_sin_cache[positions].chunk(2, dim=-1)

    if positions.ndim == 2:
        assert self.mrope_section

        def pick_sections(table: torch.Tensor) -> torch.Tensor:
            # Section i of the feature axis is taken from position row i.
            chunks = table.split(self.mrope_section, dim=-1)
            return torch.cat(
                [chunk[row] for row, chunk in enumerate(chunks)], dim=-1
            )

        cos = pick_sections(cos)
        sin = pick_sections(sin)

    def rotate(states: torch.Tensor) -> torch.Tensor:
        # Rotate the leading rotary_dim features; pass the rest through.
        original_shape = states.shape
        per_head = states.view(num_tokens, -1, self.head_size)
        rope_part = per_head[..., : self.rotary_dim]
        passthrough = per_head[..., self.rotary_dim :]
        rope_part = _apply_rotary_emb(rope_part, cos, sin, self.is_neox_style)
        return torch.cat((rope_part, passthrough), dim=-1).reshape(original_shape)

    query = rotate(query)
    key = rotate(key)
    return query, key
Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + vision_end_token_id: int, + spatial_merge_size: int, + context_len: int = 0, + ) -> Tuple[List[List[int]], int]: + """Get mrope input positions and delta value.""" + + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + if isinstance(video_grid_thw, torch.Tensor): + video_grid_thw = video_grid_thw.tolist() + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id + ).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, 
@staticmethod
def get_next_input_positions(
    mrope_position_delta: int,
    context_len: int,
    seq_len: int,
) -> List[List[int]]:
    """Return the three position rows for tokens ``context_len..seq_len``.

    Each row is the same run of consecutive integers,
    ``context_len + mrope_position_delta`` up to (but excluding)
    ``seq_len + mrope_position_delta``. The three rows are independent lists.
    """
    first = context_len + mrope_position_delta
    stop = seq_len + mrope_position_delta
    return [list(range(first, stop)) for _ in range(3)]
def forward(
    self,
    qkv: torch.Tensor,
    q_weight: torch.Tensor,
    k_weight: torch.Tensor,
    positions: torch.Tensor,
    num_heads: int,
    num_kv_heads: int,
    eps: float,
    fused_set_kv_buffer_arg: Optional[AiterFusedSetKVBufferArg] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """Run the fused QK-norm + RoPE + KV-cache-write kernel.

    Forward pass of ``RotaryEmbeddingFusedQKNorm``.

    Args:
        qkv: Packed QKV activations passed straight to the kernel.
        q_weight: Norm weight for Q.
        k_weight: Norm weight for K.
        positions: 1-D token positions; its length is the token count.
        num_heads: Number of query heads.
        num_kv_heads: Number of key heads and of value heads.
        eps: Epsilon forwarded to the kernel.
        fused_set_kv_buffer_arg: KV-cache destination and options. Required;
            the path without it is not implemented.

    Returns:
        ``(q_out, k_out, v_out)``. ``q_out`` is a new
        ``(num_tokens, num_heads, head_size)`` tensor in ``qkv``'s dtype.
        ``k_out`` and ``v_out`` are new tensors in the KV-cache dtype when
        ``fused_set_kv_buffer_arg.return_kv`` is true, otherwise ``None``.

    Raises:
        NotImplementedError: If ``fused_set_kv_buffer_arg`` is ``None``.
    """
    assert positions.ndim == 1
    if fused_set_kv_buffer_arg is None:
        # TODO: wire up the plain fused_rope_rms kernel for this path. The
        # statements that used to follow this raise were unreachable.
        raise NotImplementedError("fused_rope_rms not supported yet")

    num_tokens = positions.shape[0]
    num_heads_q = num_heads
    num_heads_k = num_kv_heads
    num_heads_v = num_kv_heads

    q_out = torch.empty(
        num_tokens,
        num_heads_q,
        self.head_size,
        dtype=qkv.dtype,
        device=qkv.device,
    )

    # k_out / v_out are only allocated when the caller wants them back.
    return_kv = fused_set_kv_buffer_arg.return_kv
    kv_cache_dtype = fused_set_kv_buffer_arg.kv_cache[0].dtype
    k_out = None
    v_out = None
    if return_kv:
        k_out = torch.empty(
            num_tokens,
            num_heads_k,
            self.head_size,
            dtype=kv_cache_dtype,
            device=qkv.device,
        )
        v_out = torch.empty(
            num_tokens,
            num_heads_v,
            self.head_size,
            dtype=kv_cache_dtype,
            device=qkv.device,
        )

    fused_qk_norm_rope_cache_pts_quant_shuffle(
        qkv,
        q_weight,
        k_weight,
        self.cos_sin_cache,
        positions,
        num_tokens,
        num_heads_q,
        num_heads_k,
        num_heads_v,
        self.head_size,
        self.is_neox_style,
        eps,
        q_out,
        fused_set_kv_buffer_arg.kv_cache[0],
        fused_set_kv_buffer_arg.kv_cache[1],
        fused_set_kv_buffer_arg.cache_loc,
        fused_set_kv_buffer_arg.k_scale,
        fused_set_kv_buffer_arg.v_scale,
        k_out,
        v_out,
        return_kv,
        fused_set_kv_buffer_arg.use_shuffle_layout,
        fused_set_kv_buffer_arg.block_size,
        fused_set_kv_buffer_arg.x,
    )
    return q_out, k_out, v_out
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + self.mrope_interleaved = mrope_interleaved + self.mrope_section = mrope_section + if self.mrope_section: + expected_sum = rotary_dim // 2 + actual_sum = sum(self.mrope_section) + if actual_sum != expected_sum: + print( + f"MRoPE section sum mismatch: expected {expected_sum}, got {actual_sum}. " + f"Adjusting mrope_section to match rotary_dim // 2 = {expected_sum}" + ) + # Auto-correct by scaling the mrope_section proportionally + if actual_sum > 0: + scale_factor = expected_sum / actual_sum + self.mrope_section = [ + max(1, int(section * scale_factor)) + for section in self.mrope_section + ] + # Ensure the sum exactly matches by adjusting the last element + current_sum = sum(self.mrope_section) + if current_sum != expected_sum: + self.mrope_section[-1] += expected_sum - current_sum + else: + # If all sections are 0, create a default distribution + self.mrope_section = [ + expected_sum // len(self.mrope_section) + ] * len(self.mrope_section) + # Handle remainder + remainder = expected_sum % len(self.mrope_section) + for i in range(remainder): + self.mrope_section[i] += 1 + + print( + f"Corrected mrope_section: {self.mrope_section} (sum={sum(self.mrope_section)})" + ) + + def forward( + self, + qkv: torch.Tensor, + q_weight: torch.Tensor, + k_weight: torch.Tensor, + positions: torch.Tensor, + num_heads: int, + num_kv_heads: int, + eps: float, + fused_set_kv_buffer_arg: Optional[AiterFusedSetKVBufferArg] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + assert positions.ndim == 2 + num_tokens = positions.shape[-1] + num_heads_q = num_heads + num_heads_k = num_kv_heads + num_heads_v = num_kv_heads + is_interleaved = ( + True if positions.ndim == 2 and self.mrope_section is not None else False + ) + assert is_interleaved == self.mrope_interleaved + if fused_set_kv_buffer_arg is not None: + q_out = torch.empty( + num_tokens, + num_heads_q, + self.head_size, + dtype=qkv.dtype, 
class DualChunkRotaryEmbedding(nn.Module):
    """Rotary positional embedding for Dual Chunk Attention.

    Five cos/sin lookup tables are precomputed at construction time and
    registered as non-persistent buffers:

    * ``cos_sin_q_cache``           - positions ``0 .. chunk_len - 1``
    * ``cos_sin_qc_cache``          - positions ``chunk_len ..`` clamped to
      ``chunk_size``
    * ``cos_sin_k_cache``           - ``arange(max_position_embeddings) % chunk_len``
    * ``cos_sin_qc_no_clamp_cache`` - positions ``chunk_len ..`` without clamp
    * ``cos_sin_q_inter_cache``     - positions ``chunk_size ..``

    where ``chunk_len = chunk_size - local_size``.
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int,
        is_neox_style: bool,
        dtype: torch.dtype,
        chunk_size: int,
        local_size: int,
    ) -> None:
        super().__init__()
        self.head_size = head_size
        self.rotary_dim = rotary_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.is_neox_style = is_neox_style
        self.chunk_size = chunk_size
        self.local_size = local_size
        self.dtype = dtype
        # The caches are placed on the currently selected CUDA device.
        self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
        q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache = (
            self._compute_cos_sin_cache()
        )

        self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
        self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
        self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
        self.register_buffer(
            "cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
        )
        self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)

    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
        """Compute the inverse frequency."""
        # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
        # However, we use `torch.arange(..., dtype=torch.float)` instead to
        # avoid numerical issues with large base values (e.g., 10000000).
        # This may cause a slight numerical difference between the HF
        # implementation and ours.
        # NOTE(woosuk): To exactly match the HF implementation, we need to
        # use CPU to compute the cache and then move it to GPU. However, we
        # create the cache on GPU for faster initialization. This may cause
        # a slight numerical difference between the HF implementation and ours.
        inv_freq = 1.0 / (
            base
            ** (
                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
            )
        )
        return inv_freq

    def _build_cache(self, t: torch.Tensor, inv_freq: torch.Tensor) -> torch.Tensor:
        """Return ``cat(cos(outer(t, inv_freq)), sin(...))`` in ``self.dtype`` on ``self.device``."""
        freqs = torch.outer(t, inv_freq)
        return torch.cat((freqs.cos(), freqs.sin()), dim=-1).to(
            dtype=self.dtype, device=self.device
        )

    def _compute_cos_sin_cache(
        self,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute the five cos/sin caches.

        Returns:
            ``(q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache)``;
            each row is ``[cos..., sin...]`` for one position.
        """
        inv_freq = self._compute_inv_freq(self.base)
        chunk_len = self.chunk_size - self.local_size
        chunk_positions = torch.arange(chunk_len, dtype=torch.float)

        q_t = chunk_positions
        qc_t = (chunk_positions + chunk_len).clamp(max=self.chunk_size)
        k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len
        # count from chunk_len, no clamp(self.chunk_size) restriction
        qc_no_clamp_t = chunk_positions + chunk_len
        # count from self.chunk_size for q_inter's rope
        q_inter_t = chunk_positions + self.chunk_size

        q_cache = self._build_cache(q_t, inv_freq)
        qc_cache = self._build_cache(qc_t, inv_freq)
        k_cache = self._build_cache(k_t, inv_freq)
        qc_no_clamp_cache = self._build_cache(qc_no_clamp_t, inv_freq)
        q_inter_cache = self._build_cache(q_inter_t, inv_freq)
        return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache

    def forward(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        offsets: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply the dual-chunk rotary embedding.

        Args:
            positions: indices into the caches.
            query: tensor whose last dim is reshaped to ``(-1, head_size)``.
            key: tensor whose last dim is reshaped to ``(-1, head_size)``.
            offsets: optional tensor added to ``positions`` before lookup.

        Returns:
            ``(query, key)``. ``query`` is the concatenation along the last
            dim of five rotated variants, in this order: ``query``,
            ``query_succ``, ``query_inter``, ``query_succ_critical``,
            ``query_inter_critical``.
        """
        query = query.view(*query.shape[:-1], -1, self.head_size)
        key = key.view(*key.shape[:-1], -1, self.head_size)
        query_rot = query[..., : self.rotary_dim]
        key_rot = key[..., : self.rotary_dim]
        if self.rotary_dim < self.head_size:
            query_pass = query[..., self.rotary_dim :]
            key_pass = key[..., self.rotary_dim :]
        else:
            query_pass = None
            key_pass = None

        positions_with_offsets = (
            torch.add(positions, offsets) if offsets is not None else positions
        )
        key = self._apply_rotary_embedding(
            self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
        )
        chunk_len = self.chunk_size - self.local_size
        # Every query-side table except `query_inter` uses this same index.
        in_chunk_positions = positions_with_offsets % chunk_len
        query = self._apply_rotary_embedding(
            self.cos_sin_q_cache[in_chunk_positions],
            query_rot,
            query_pass,
        )
        query_succ = self._apply_rotary_embedding(
            self.cos_sin_qc_cache[in_chunk_positions],
            query_rot,
            query_pass,
        )
        # query_inter uses one fixed row (the last chunk position) for every token.
        query_inter = self._apply_rotary_embedding(
            self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
            query_rot,
            query_pass,
        )
        query_succ_critical = self._apply_rotary_embedding(
            self.cos_sin_qc_no_clamp_cache[in_chunk_positions],
            query_rot,
            query_pass,
        )
        query_inter_critical = self._apply_rotary_embedding(
            self.cos_sin_q_inter_cache[in_chunk_positions],
            query_rot,
            query_pass,
        )

        # merge query into one tensor to simplify the interfaces
        query = torch.cat(
            (
                query,
                query_succ,
                query_inter,
                query_succ_critical,
                query_inter_critical,
            ),
            dim=-1,
        )
        return query, key

    def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
        """Rotate ``hidden_rot`` with ``cos_sin`` and re-attach ``hidden_pass``.

        ``hidden_pass`` is only used when ``rotary_dim < head_size``.
        """
        cos, sin = cos_sin.chunk(2, dim=-1)
        if self.is_neox_style:
            # NOTE(woosuk): Here we assume that the positions tensor has the
            # shape [batch_size, seq_len].
            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
        else:
            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
        hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin

        if self.rotary_dim < self.head_size:
            hidden = torch.cat((hidden_rot, hidden_pass), dim=-1)
        else:
            hidden = hidden_rot
        return hidden.flatten(-2).squeeze(0)

    def extra_repr(self) -> str:
        """Return the constructor arguments as a string for ``repr(module)``."""
        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
        s += f", max_position_embeddings={self.max_position_embeddings}"
        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
        s += f", chunk_size={self.chunk_size}, local_size={self.local_size}"
        return s
correct one should be "longrope" but keep "su" here + # for backward compatible + if scaling_type not in {"su", "longrope"}: + scaling_factor = rope_scaling.get("factor", 1.0) + if scaling_type == "llama3": + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + original_max_position = rope_scaling["original_max_position_embeddings"] + rotary_emb = Llama3RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + scaling_factor, + low_freq_factor, + high_freq_factor, + original_max_position, + ) + elif scaling_type == "linear": + rotary_emb = LinearScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + elif scaling_type == "dynamic": + rotary_emb = DynamicNTKScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + elif scaling_type == "yarn": + original_max_position = rope_scaling["original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k + in ("extrapolation_factor", "attn_factor", "beta_fast", "beta_slow") + } + rotary_emb = YaRNScalingRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + scaling_factor, + dtype, + **extra_kwargs, + ) + elif scaling_type == "deepseek_yarn": + original_max_position = rope_scaling["original_max_position_embeddings"] + # assert max_position == original_max_position * scaling_factor + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k + in ( + "extrapolation_factor", + "attn_factor", + "beta_fast", + "beta_slow", + "mscale", + "mscale_all_dim", + ) + } + rotary_emb = DeepseekScalingRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + scaling_factor, + dtype, + **extra_kwargs, + ) + # The correct one should be "longrope" but keep "su" here + # for backward compatible + 
def get_rope_wrapper(
    head_size: int,
    rotary_dim: int,
    max_position: int,
    base: int,
    is_neox_style: bool = True,
    rope_scaling: Optional[Dict[str, Any]] = None,
    dtype: Optional[torch.dtype] = None,
    partial_rotary_factor: float = 1.0,
    device: Optional[str] = None,
):
    """Return the rotary embedding from ``get_rope`` for non-CPU devices.

    All arguments except ``device`` are forwarded unchanged to ``get_rope``.

    Raises:
        NotImplementedError: if ``device == "cpu"``. An explicit raise is
            used instead of ``assert False`` because asserts are stripped
            under ``python -O``, which made the CPU path return ``None``.
    """
    if device == "cpu":
        raise NotImplementedError(
            "get_rope_wrapper in AITER is not implemented for cpu device"
        )

    return get_rope(
        head_size,
        rotary_dim,
        max_position,
        base,
        is_neox_style,
        rope_scaling,
        dtype,
        partial_rotary_factor,
    )
pd.set_option("display.max_colwidth", None) +# pd.set_option("display.expand_frame_repr", False) + + +def perftest( + num_iters=101, num_warmup=2, testGraph=False, num_rotate_args=0, needTrace=False +): + def decorator(func): + def wrapper(*args, **kwargs): + num = num_rotate_args + if num < 1: + gpu_id = torch.cuda.current_device() + iter_used_memory, inputSize, _, _ = device_memory_profiling( + func, *args, **kwargs + ) + + properties = torch.cuda.get_device_properties(gpu_id) + free_memory = torch.cuda.mem_get_info(gpu_id)[0] + cache_size = min( + getattr(properties, "L2_cache_size", 4096 * 1024) * 64 * 128, + (free_memory - iter_used_memory + inputSize) * 0.9, + ) + cache_size = max(cache_size, 0) + num = int((cache_size + inputSize - 1) // inputSize) + num = min(num, num_iters) + + rotate_args = [ + (copy.deepcopy(args), copy.deepcopy(kwargs)) for _ in range(num - 1) + ] + [(args, kwargs)] + run_iters(num_warmup, func, *args, **kwargs) + torch.cuda.synchronize() + if int(os.environ.get("AITER_LOG_MORE", 0)): + latencies = [] + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + for _ in range(num_iters): + start_event.record() + data = func(*args, **kwargs) + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = np.mean(latencies) * 1000 + logger.info(f"avg: {avg} us/iter from cuda.Event") + if testGraph: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + data = run_iters_rotate(num_iters, func, rotate_args) + with tpf.profile( + activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA], + profile_memory=True, + with_stack=True, + with_modules=True, + ) as prof: + run_iters(1, graph.replay) + avg = get_trace_perf(prof, num_iters) + logger.info(f"avg: {avg} us/iter with hipgraph") + with tpf.profile( + activities=[tpf.ProfilerActivity.CPU, tpf.ProfilerActivity.CUDA], + profile_memory=False, + with_stack=False, + with_modules=True, + 
# performance test for distributed case
def perftest_dist(
    num_iters=101, num_warmup=2, testGraph=False, num_rotate_args=0, needTrace=False
):
    """Decorator factory that profiles ``func`` in a distributed run.

    The wrapped function is warmed up, all ranks are synchronised with a
    barrier, and ``func`` is then called ``num_iters`` times under the
    PyTorch profiler with an extra ``iter=<index>`` keyword argument.

    Args:
        num_iters: number of profiled calls.
        num_warmup: number of un-profiled warm-up calls (no ``iter`` kwarg).
        testGraph: accepted for signature parity with ``perftest``; not read
            here.
        num_rotate_args: number of argument copies to prepare; values below 1
            trigger an automatic estimate from free device memory.
        needTrace: accepted for signature parity with ``perftest``; not read
            here.

    Returns:
        A decorator. The wrapped call returns ``(output_list, avg)`` where
        ``avg`` is the value returned by ``get_trace_perf`` and
        ``output_list`` is always empty (outputs are not collected).
    """
    # Local imports: the module neither imports `functools` nor binds `dist`,
    # so the original `dist.barrier()` raised NameError on every call.
    import functools

    import torch.distributed as dist

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            num = num_rotate_args
            if num < 1:
                gpu_id = torch.cuda.current_device()

                iter_used_memory, inputSize, _, _ = device_memory_profiling(
                    func, *args, **kwargs
                )

                properties = torch.cuda.get_device_properties(gpu_id)
                free_memory = torch.cuda.mem_get_info(gpu_id)[0]
                cache_size = min(
                    getattr(properties, "L2_cache_size", 4096 * 1024) * 64 * 128,
                    (free_memory - iter_used_memory + inputSize) * 0.9,
                )
                cache_size = max(cache_size, 0)
                num = int((cache_size + inputSize - 1) // inputSize)
                # print(f"{iter_used_memory=}, {inputSize=}, {cache_size=}, {free_memory=}, {num=}")
            num = min(num, num_iters)

            # NOTE(review): `rotate_args` is prepared but the rotating runner
            # below is commented out, so these copies are currently unused.
            rotate_args = [
                (copy.deepcopy(args), copy.deepcopy(kwargs)) for _ in range(num - 1)
            ] + [(args, kwargs)]

            run_iters(num_warmup, func, *args, **kwargs)
            torch.cuda.synchronize()
            # Line up all ranks before timing starts.
            dist.barrier()
            if int(os.environ.get("AITER_LOG_MORE", 0)):
                output_list = []
                latencies = []
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)
                for it in range(num_iters):
                    start_event.record()
                    data = func(*args, **kwargs, iter=it)
                    end_event.record()
                    end_event.synchronize()
                    latencies.append(start_event.elapsed_time(end_event))
                    output_list.append(data)
                avg = np.mean(latencies) * 1000
                # logger.info(f"avg: {avg} us/iter from cuda.Event")
            with tpf.profile(
                activities=[tpf.ProfilerActivity.CUDA],
                # profile_memory=True,
                with_stack=False,
                # with_modules=True,
                record_shapes=False,
                # on_trace_ready = (
                #     tpf.tensorboard_trace_handler("./aiter_logs/")
                #     if needTrace else None),
            ) as prof:
                # data = run_iters_rotate(num_iters, func, rotate_args)
                # output_list, _ = perf_func(partial(func, *args), iters=num_iters, warmup_iters=num_warmup)
                output_list = []
                for it in range(num_iters):
                    data = func(*args, **kwargs, iter=it)
                    # output_list.append(data)
            df = get_trace_perf(prof, num_iters)
            return output_list, df

        return wrapper

    return decorator
def run_iters(num_iters, func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` ``num_iters`` times.

    Returns:
        The result of the last call, or ``None`` when ``num_iters <= 0``.
    """
    data = None
    for _ in range(num_iters):
        data = func(*args, **kwargs)
    return data


def run_iters_rotate(num_iters, func, rotate_args):
    """Call ``func`` ``num_iters`` times, cycling through ``rotate_args``.

    Args:
        num_iters: number of calls to make.
        func: callable to invoke.
        rotate_args: sequence of ``(args, kwargs)`` pairs; call ``i`` uses
            entry ``i % len(rotate_args)``.

    Returns:
        The result of the last call, or ``None`` when ``num_iters <= 0``.

    Raises:
        ValueError: if calls are requested but ``rotate_args`` is empty
            (previously an opaque ``ZeroDivisionError``).
    """
    data = None
    num_rotate_args = len(rotate_args)
    if num_iters > 0 and num_rotate_args == 0:
        raise ValueError("rotate_args must contain at least one (args, kwargs) pair")
    for i in range(num_iters):
        args, kwargs = rotate_args[i % num_rotate_args]
        data = func(*args, **kwargs)

    return data


def run_perftest(
    func,
    *args,
    num_iters=101,
    num_warmup=2,
    testGraph=False,
    num_rotate_args=0,
    needTrace=False,
    **kwargs,
):
    """Run ``func(*args, **kwargs)`` once through the ``perftest`` decorator.

    The keyword-only options are forwarded to ``perftest``; the return value
    is whatever the decorated call returns.
    """

    @perftest(
        num_iters=num_iters,
        num_warmup=num_warmup,
        testGraph=testGraph,
        num_rotate_args=num_rotate_args,
        needTrace=needTrace,
    )
    def worker(*args, **kwargs):
        return func(*args, **kwargs)

    return worker(*args, **kwargs)
+ logger.info(f"\n{prefix}{info})") + return callargs + + +def post_process_data(df, num_iters, warm_iter=1): + """remove abnormal data""" + + device_df = df[df["device_type"].astype(str).str.contains("DeviceType.CUDA")] + # print("devicedf is ", device_df) + if device_df.empty: + return [], 0 + kernels_num = int(len(device_df) / num_iters) + + act_iters = num_iters + valid_n = len(device_df) + dropped_indexs = [] + if len(device_df) % num_iters == 0: + kernels_num = int(len(device_df) / num_iters) + else: + ##get correct kernel num + name_list = device_df["name"].tolist() + max_kernel_num = 20 + n = len(name_list) + for step in range(1, min(max_kernel_num, n // 2 + 1)): + sub_list = [name_list[i] for i in range(step)] + m = len(sub_list) + + valid_n = int(n / m) * m + pattern_match = all( + name_list[i] == sub_list[i % m] for i in range(int(n / m) * m) + ) + if pattern_match: + kernels_num = m + act_iters = valid_n / m + break + dropped_indexs = device_df.iloc[valid_n:].index.tolist() + if kernels_num == 0: + print("data missed, the time may be inaccurate!") + + test_df = device_df.iloc[:valid_n].reset_index() + grouped_kernel_df = test_df.groupby(test_df.index // kernels_num, sort=False).agg( + {"self_device_time_total": "sum", "index": list} + ) + + # rm warm iters + sum_df = grouped_kernel_df.iloc[warm_iter:].reset_index(drop=True) + out_range_idx = [] + if num_iters > 30: + # IQR to remove abnormal data + k = 1.5 + Q1 = sum_df["self_device_time_total"].quantile(0.25) + Q3 = sum_df["self_device_time_total"].quantile(0.75) + IQR = Q3 - Q1 + lower = Q1 - k * IQR + upper = Q3 + k * IQR + out_range_idx = sum_df.index[ + (sum_df["self_device_time_total"] < lower) + | (sum_df["self_device_time_total"] > upper) + ].tolist() + out_range_num = len(out_range_idx) + + indices = {idx for i in out_range_idx for idx in sum_df.iloc[i]["index"]} + + index_sublists = grouped_kernel_df["index"].head(warm_iter).tolist() + indices_to_add = [idx for sublist in index_sublists for 
idx in sublist] + indices.update(indices_to_add) + indices.update(dropped_indexs) + # if int(os.environ.get("AITER_LOG_MORE", 0)): + # logger.info(f"abnormal data indices: {indices}") + # for i in indices: + # logger.info(f"abnormal data: {df.iloc[i]['self_device_time_total']}") + return list(indices), out_range_num + warm_iter + num_iters - act_iters + + +def get_trace_perf(prof, num_iters): + assert num_iters > 1 + warm_iter = 1 + num_iters -= warm_iter + df = [] + cols = [ + "name", + "self_cpu_time_total", + "self_device_time_total", + "device_type", + "device_index", + ] + for el in prof.events(): + df.append([getattr(el, x, None) for x in cols]) + df = pd.DataFrame(df, columns=cols) + ###remove abnormal data + dropped_num = warm_iter + dropped_indexs, dropped_num = post_process_data( + df, num_iters + warm_iter, warm_iter + ) + df = df.drop(dropped_indexs) + iter_init = 0 # warm_iter dropped + df["cnt"] = 1 + rets = [] + + for name, d in df.groupby("name", sort=False): + kernel_num_per_iter = iter_init + if str(d["device_type"].iat[0]).split(".")[-1] != "CUDA": + kernel_num_per_iter = 1 + r = d.iloc[kernel_num_per_iter:][ + ["cnt", "self_cpu_time_total", "self_device_time_total"] + ].sum() + if not r.empty: + device_type = str(d["device_type"].iat[0]).split(".")[-1] + r["name"] = name + r["device_type"] = device_type + r["device_index"] = str(d["device_index"].iat[0]) + if device_type == "CUDA": + r["device_time_sum"] = r["self_device_time_total"] + r["host_time_sum"] = 0 + else: + r["host_time_sum"] = r["self_device_time_total"] + r["device_time_sum"] = 0 + rets.append(r) + df = pd.DataFrame(rets) + cols = [ + "name", + "cnt", + "host_time_sum", + "device_time_sum", + "device_type", + "device_index", + ] + cols = [el for el in cols if el in df.columns] + df = df[(df.host_time_sum > 0) | (df.device_time_sum > 0)] + + timerList = [ + "host_time_sum", + "device_time_sum", + ] + df = df[cols].sort_values(timerList, ignore_index=True) + actual_iters = num_iters + 
def _collect_mismatches(a, b, mask, with_values):
    """Count the elements selected by ``mask`` and optionally gather them.

    Returns ``(num, percent, details)`` where ``details`` is ``None`` when
    ``with_values`` is false, else ``(a[mask], b[mask], |a[mask] - b[mask]|)``.
    """
    num = mask.sum()
    percent = (num / a.numel()).item()
    if not with_values:
        return num, percent, None
    a_msked = a[mask]
    b_msked = b[mask]
    return num, percent, (a_msked, b_msked, (a_msked - b_msked).abs())


def checkAllclose(
    a,
    b,
    rtol=1e-2,
    atol=1e-2,
    tol_err_ratio=0.05,
    msg="",
    printNum=8,
    printLog=True,
    perfModel=False,
):
    """Compare two tensors with ``torch.isclose`` and report the mismatch ratio.

    Args:
        a, b: tensors to compare.
        rtol, atol: tolerances forwarded to ``torch.isclose``.
        tol_err_ratio: mismatch ratio above which the log says "failed"
            rather than "warning".
        msg: prefix for log messages.
        printNum: maximum number of mismatching values to log.
        printLog: when false, nothing is logged and only the ratio is
            returned.
        perfModel: when true, both tensors are moved to CPU and cast to
            float32 before comparison.

    Returns:
        ``0`` when every element is close, otherwise the fraction of
        elements that are not close.
    """
    if perfModel:
        a = a.to("cpu").to(torch.float32)
        b = b.to("cpu").to(torch.float32)

    isClose = torch.isclose(a, b, rtol=rtol, atol=atol)
    if isClose.all():
        if printLog:
            logger.info(f"{msg}[checkAllclose {atol=} {rtol=} \033[32mpassed~\033[0m]")
        return 0

    try:
        num, percent, details = _collect_mismatches(a, b, ~isClose, printLog)
    except RuntimeError:
        # Same computation retried with the mask moved to the CPU.
        num, percent, details = _collect_mismatches(
            a, b, ~isClose.to("cpu"), printLog
        )
    if details is None:
        return percent

    a_msked, b_msked, delta = details
    printNum = min(printNum, int(num))
    if percent > tol_err_ratio:
        logger.info(
            f"""{msg}[checkAllclose {atol=} {rtol=} \033[31mfailed!\033[0m]
    a    : {a.shape}
           {a_msked[:printNum]}
    b    : {b.shape}
           {b_msked[:printNum]}
    delta:
           {delta[:printNum]}"""
        )
    else:
        logger.info(
            f"""{msg}[checkAllclose {atol=} {rtol=} \033[33mwarning!\033[0m] a and b results are not all close"""
        )
    logger.info(
        f"-->max abs delta:{delta.max()}, delta details: {percent:.1%} ({num} of {a.numel()}) elements"
    )
    return percent
def tensor_load(filename: str):
    """Load a tensor written by ``tensor_dump``.

    ``filename`` is the raw ``.bin`` payload; the shape and dtype are read
    from the sibling ``.meta`` file (same path with the extension replaced).

    The payload is read byte-wise (``uint8``), matching how ``tensor_dump``
    writes it. Reading it as ``uint32`` silently dropped trailing bytes
    whenever the payload size was not a multiple of four.
    """
    raw_bytes = np.fromfile(filename, dtype=np.uint8)
    metafile = os.path.splitext(filename)[0] + ".meta"
    with open(metafile) as f:
        # NOTE(security): the meta lines are evaluated as Python expressions
        # (e.g. "torch.Size([2, 3])", "torch.float16"). Only load dumps from
        # a trusted source.
        shape, dtype = [eval(line.strip()) for line in f]
    return torch.tensor(raw_bytes).view(dtype).view(shape)
def pad_rearrange_dropout_mask_hts_to_bhss(
    S_dmask, cu_seqlens_q, seqlen_q_rounded, seqlen_k_rounded
):
    """pad + rearrange [nheads, total_q, max_seqlen_k] into [b, nheads, seqlen_q_rounded, seqlen_k_rounded]
    Arguments:
        S_dmask: (nheads, total_q, max_seqlen_k)
        cu_seqlens_q: (b + 1)
    Output:
        S_dmask: (b, nheads, seqlen_q_rounded, seqlen_k_rounded)

    Each per-sequence slice is zero-padded at the end of its query and key
    dims. The result is a freshly stacked (contiguous) tensor.
    """
    cu_seqlens_q = cu_seqlens_q.reshape(-1)
    # Per-sequence query lengths from the cumulative offsets.
    seqlens_q = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).tolist()
    # Each piece: (nheads, seqlen_qi, max_seqlen_k) -> (nheads, seqlen_q_rounded, seqlen_k_rounded)
    padded_masks = [
        F.pad(
            mask,
            (
                0,
                seqlen_k_rounded - mask.shape[2],
                0,
                seqlen_q_rounded - mask.shape[1],
            ),
        )
        for mask in torch.split(S_dmask, seqlens_q, dim=1)
    ]
    return torch.stack(padded_masks, dim=0)
def generate_qkv(
    q,
    k,
    v,
    query_padding_mask=None,
    key_padding_mask=None,
    kvpacked=False,
    qkvpacked=False,
):
    """Build padded and unpadded (varlen) views of q/k/v for attention tests.

    Arguments:
        q: (batch_size, seqlen_q, nheads, d)
        k: (batch_size, seqlen_k, nheads_k, d)
        v: (batch_size, seqlen_k, nheads_k, d_v)
        query_padding_mask: (batch_size, seqlen), bool
        key_padding_mask: (batch_size, seqlen), bool
        kvpacked: stack k and v into a single tensor (requires d == d_v)
        qkvpacked: stack q, k and v into a single tensor (requires equal head
            counts, d == d_v and identical padding masks)
    Returns (a tuple whose layout depends on the packing mode):
        qkvpacked: (qkv_unpad, cu_seqlens_q, max_seqlen_q, qkv,
                    output_pad_fn, dqkv_pad_fn)
        kvpacked:  (q_unpad, kv_unpad, cu_seqlens_q, cu_seqlens_k,
                    max_seqlen_q, max_seqlen_k, q, kv,
                    output_pad_fn, dq_pad_fn, dkv_pad_fn)
        otherwise: (q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k,
                    max_seqlen_q, max_seqlen_k, q, k, v,
                    output_pad_fn, dq_pad_fn, dk_pad_fn)
        All returned tensors are detached copies with requires_grad enabled;
        the ``*_pad_fn`` callables map an unpadded tensor back to the padded
        batch layout.
    """
    assert not (kvpacked and qkvpacked)
    batch_size, seqlen_q, nheads, d = q.shape
    _, seqlen_k, nheads_k, _ = k.shape
    _, _, _, d_v = v.shape
    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
    assert v.shape == (batch_size, seqlen_k, nheads_k, d_v)

    if query_padding_mask is not None:
        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q, _ = unpad_input(
            q, query_padding_mask
        )
        output_pad_fn = lambda output_unpad: pad_input(
            output_unpad, indices_q, batch_size, seqlen_q
        )
    else:
        # No padding: every sequence has the full length, so the cumulative
        # offsets are simply multiples of seqlen_q.
        q_unpad = rearrange(q, "b s h d -> (b s) h d")
        cu_seqlens_q = torch.arange(
            0,
            (batch_size + 1) * seqlen_q,
            step=seqlen_q,
            dtype=dtypes.i32,
            device=q_unpad.device,
        )
        max_seqlen_q = seqlen_q
        output_pad_fn = lambda output_unpad: rearrange(
            output_unpad, "(b s) h d -> b s h d", b=batch_size
        )

    if key_padding_mask is not None:
        k_unpad, indices_k, cu_seqlens_k, max_seqlen_k, _ = unpad_input(
            k, key_padding_mask
        )
        v_unpad, _, _, _, _ = unpad_input(v, key_padding_mask)
    else:
        k_unpad = rearrange(k, "b s h d -> (b s) h d")
        v_unpad = rearrange(v, "b s h d -> (b s) h d")
        cu_seqlens_k = torch.arange(
            0,
            (batch_size + 1) * seqlen_k,
            step=seqlen_k,
            dtype=dtypes.i32,
            device=k_unpad.device,
        )
        max_seqlen_k = seqlen_k

    if qkvpacked:
        # Packing q/k/v together requires the same padding for all three.
        # `None == None` is a plain bool with no `.all()`, so the mask-free
        # case has to be handled explicitly.
        if query_padding_mask is None or key_padding_mask is None:
            assert (
                query_padding_mask is None and key_padding_mask is None
            ), "qkvpacked requires identical query and key padding masks"
        else:
            assert (query_padding_mask == key_padding_mask).all()
        assert nheads == nheads_k
        assert d == d_v
        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
        qkv = torch.stack([q, k, v], dim=2)
        if query_padding_mask is not None:
            dqkv_pad_fn = lambda dqkv_unpad: pad_input(
                dqkv_unpad, indices_q, batch_size, seqlen_q
            )
        else:
            dqkv_pad_fn = lambda dqkv_unpad: rearrange(
                dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
            )
        return (
            qkv_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            max_seqlen_q,
            qkv.detach().requires_grad_(),
            output_pad_fn,
            dqkv_pad_fn,
        )
    elif kvpacked:
        assert d == d_v
        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
        kv = torch.stack([k, v], dim=2)
        # dq has the same layout as the attention output.
        dq_pad_fn = output_pad_fn
        if key_padding_mask is not None:
            dkv_pad_fn = lambda dkv_unpad: pad_input(
                dkv_unpad, indices_k, batch_size, seqlen_k
            )
        else:
            dkv_pad_fn = lambda dkv_unpad: rearrange(
                dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
            )
        return (
            q_unpad.detach().requires_grad_(),
            kv_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            q.detach().requires_grad_(),
            kv.detach().requires_grad_(),
            output_pad_fn,
            dq_pad_fn,
            dkv_pad_fn,
        )
    else:
        dq_pad_fn = output_pad_fn
        if key_padding_mask is not None:
            dk_pad_fn = lambda dk_unpad: pad_input(
                dk_unpad, indices_k, batch_size, seqlen_k
            )
        else:
            dk_pad_fn = lambda dk_unpad: rearrange(
                dk_unpad, "(b s) h d -> b s h d", b=batch_size
            )
        return (
            q_unpad.detach().requires_grad_(),
            k_unpad.detach().requires_grad_(),
            v_unpad.detach().requires_grad_(),
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            q.detach().requires_grad_(),
            k.detach().requires_grad_(),
            v.detach().requires_grad_(),
            output_pad_fn,
            dq_pad_fn,
            dk_pad_fn,
        )
def attention_ref(
    q,
    k,
    v,
    query_padding_mask=None,
    key_padding_mask=None,
    attn_bias=None,
    dropout_p=0.0,
    dropout_mask=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite window size
    softcap=0.0,
    upcast=True,
    reorder_ops=False,
    key_leftpad=None,
):
    """Reference (unfused) scaled-dot-product attention used to check kernels.

    Arguments:
        q: (batch_size, seqlen_q, nheads, head_dim_q)
        k: (batch_size, seqlen_k, nheads_k, head_dim_q)
        v: (batch_size, seqlen_k, nheads_k, head_dim_v)
        query_padding_mask: (batch_size, seqlen_q)
        key_padding_mask: (batch_size, seqlen_k)
        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
        dropout_p: float
        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k); positions that
            are False are zeroed
        causal: whether to apply causal masking
        window_size: (int, int), left and right window size
        softcap: when > 0, scores are replaced by softcap * tanh(scores / softcap)
        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
            output back to fp16/bf16.
        reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.)
            without changing the math. This is to estimate the numerical error from operation
            reordering.
        key_leftpad: forwarded unchanged to construct_local_mask
    Output:
        output: (batch_size, seqlen_q, nheads, head_dim_v)
        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax probabilities
            with masked rows zeroed; dropout is NOT applied to this tensor
            (only to the copy used to compute ``output``)
    """
    # Causal attention is a local window with no look-ahead.
    if causal:
        window_size = (window_size[0], 0)
    dtype_og = q.dtype
    if upcast:
        q, k, v = q.float(), k.float(), v.float()
    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
    # Expand the k/v heads so each query head has a matching k/v head
    # (repeat factor is the integer ratio of head counts).
    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
    d = q.shape[-1]
    # The 1/sqrt(d) scale is applied to q or to k depending on reorder_ops.
    if not reorder_ops:
        scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
    else:
        scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
    if softcap > 0:
        scores = scores / softcap
        scores = scores.tanh()
        scores = scores * softcap
    # Padded keys get -inf so they receive zero probability after softmax.
    if key_padding_mask is not None:
        scores.masked_fill_(
            rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")
        )
    if window_size[0] >= 0 or window_size[1] >= 0:
        local_mask = construct_local_mask(
            seqlen_q,
            seqlen_k,
            window_size,
            query_padding_mask,
            key_padding_mask,
            q.device,
            key_leftpad=key_leftpad,
        )
        scores.masked_fill_(local_mask, float("-inf"))
    # The bias is added after masking; masked positions stay at -inf.
    if attn_bias is not None:
        scores = scores + attn_bias
    attention = torch.softmax(scores, dim=-1).to(v.dtype)
    # Some rows might be completely masked out so we fill them with zero instead of NaN
    if window_size[0] >= 0 or window_size[1] >= 0:
        attention = attention.masked_fill(
            torch.all(local_mask, dim=-1, keepdim=True), 0.0
        )
    # We want to mask here so that the attention matrix doesn't have any NaNs
    # Otherwise we'll get NaN in dV
    if query_padding_mask is not None:
        attention = attention.masked_fill(
            rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0
        )
    # NOTE(review): dropout_p == 1 would divide by zero here.
    dropout_scaling = 1.0 / (1 - dropout_p)
    # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
    # output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
    if dropout_mask is not None:
        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
    else:
        attention_drop = attention
    # The dropout rescale is folded into v instead of the attention matrix.
    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
    # Rows belonging to padded queries are zeroed in the output.
    if query_padding_mask is not None:
        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
class TunedGemm:
    """Dispatch matrix multiplications to the backend chosen by a tuning table.

    Backends are indexed by position in ``solMap``: 0 = torch, 1 = hipBLASLt,
    2 = rocBLAS, 3 = "skinny" kernels. The tuning table is read lazily from
    ``configs/tuned_gemm.csv`` on the first call to ``mm`` or ``scale_mm``.
    When the environment variable ``AITER_TUNE_GEMM`` is 1, shapes that fall
    back to torch are recorded in ``configs/untuned_gemm.csv``.
    """

    def __init__(self):
        # True once hipBLASLt is initialised and the tuning table is loaded.
        self.extensions_created = False
        # rocBLAS is only needed by mm(); it is tracked separately so that an
        # earlier scale_mm() call cannot make mm() skip the rocBLAS setup.
        self.rocb_extension_created = False
        self.save_gemm = int(os.environ.get("AITER_TUNE_GEMM", 0))
        self.untune_path = f"{this_dir}/configs/untuned_gemm.csv"
        self.tune_path = f"{this_dir}/configs/tuned_gemm.csv"
        self.bestsols = {}
        self.solMap = ["torch", "hipblaslt", "rocblas", "skinny"]
        self.cu_count = torch.cuda.get_device_properties(
            device="cuda"
        ).multi_processor_count

        # self.use_skinny = is_hip() and VLLM_USE_ROCM_SKINNY_GEMM and \
        #     "gfx1" not in torch.cuda.get_device_properties('cuda').gcnArchName
        self.use_skinny = True

        if self.save_gemm == 1:
            self.tuned_df = pd.DataFrame(
                columns=["M", "N", "K", "bias", "dtype", "outdtype", "scaleAB"]
            )
        else:
            self.tuned_df = None

    def _ensure_initialized(self, need_rocblas=False):
        """Create backend extensions and load the tuning table exactly once."""
        if need_rocblas and not self.rocb_extension_created:
            rocb_create_extension()
            self.rocb_extension_created = True
        if not self.extensions_created:
            hipb_create_extension()
            self.extensions_created = True
            self.load_best_sols()
            self.create_ds()

    def load_best_sols(self):
        """Read the tuned-solution CSV and verify its hipBLASLt kernel names."""
        if self.tune_path is not None and Path(self.tune_path).is_file():
            self.bestsols = pd.read_csv(self.tune_path)
            if len(self.bestsols) > 0 and "kernelName" in self.bestsols.columns:
                # Solution indices are only meaningful for the library build
                # they were tuned with, so compare the recorded kernel names
                # against what the current library reports.
                hipblasltKernelNames = self.bestsols.apply(
                    lambda s: (
                        getHipblasltKernelName(s.solidx)
                        if s.libtype == "hipblaslt"
                        else ""
                    ),
                    axis=1,
                )
                pd.set_option("display.max_colwidth", 100)
                assert hipblasltKernelNames.equals(
                    self.bestsols["kernelName"].fillna("")
                ), (
                    "error: gradlib tune gemm not match the current environment, need re-tune!!!\n"
                    + f"differece:\n{pd.concat([self.bestsols[['solidx','kernelName']], hipblasltKernelNames], axis=1)[hipblasltKernelNames != self.bestsols['kernelName'].fillna('')]}"
                )

    def create_ds(self):
        """Build the (shape, dtype, ...) -> (backend index, solution index) map."""
        # bestsols is an empty dict when no tuning file exists, so only
        # len() and positional row access are used here.
        df = self.bestsols
        solds = {}
        for i in range(len(df)):
            ds = df.iloc[i]
            libtype = ds["libtype"]
            if libtype not in self.solMap:
                # Previously this left `soltype` unbound (first row) or reused
                # the value from the preceding row; skip such rows instead.
                logger.warning(
                    f"ignore tuned gemm entry {i}: unknown libtype {libtype!r}"
                )
                continue
            key = (
                ds["M"],
                ds["N"],
                ds["K"],
                ds["bias"],
                ds["dtype"],
                ds["outdtype"],
                ds["scaleAB"],
            )
            solds[key] = (self.solMap.index(libtype), int(ds["solidx"]))
        self.solids = solds
        # Order must match solMap.
        self.solfuncs = [
            self.apply_torch_mm,
            self.apply_hipb_mm,
            self.apply_rocb_mm,
            self.apply_skinny,
        ]

    # NOTE: the cache is keyed on `self` too and keeps the instance alive;
    # acceptable for the module-level singleton this class is used as.
    @functools.lru_cache(maxsize=1024)
    def query_sol(self, m, n, k, bias, dtype, otype, scaleAB=False):
        """Return ``(backend index, solution index)`` for a GEMM problem.

        Small fp16 problems are routed to the skinny kernels; everything else
        is looked up in the tuning table and defaults to torch ``(0, 0)``.
        """
        if self.use_skinny and dtype == dtypes.fp16 and k % 8 == 0:
            if n > 8 and 0 < m <= 4:
                return 3, 0
            elif n % 4 == 0 and m == 1 and k <= 8192:
                return 3, 1
        soltype, solidx = self.solids.get(
            (m, n, k, bias, str(dtype), str(otype), scaleAB), (0, 0)
        )
        logger.info(
            f"using {soltype=}, {solidx=} for {m=} {n=} {k=} {dtype=} {bias=}, {scaleAB=}"
        )
        return soltype, solidx

    def apply_skinny(
        self,
        inp,
        weights,
        solidx,
        bias=None,
        otype=None,
        scale_a=None,
        scale_b=None,
        scale_c=None,
    ):
        """Run one of the skinny kernels (``solidx`` 0: wvSpltK, 1: LLMM1).

        Raises:
            ValueError: if ``solidx`` is not 0 or 1.
        """
        if solidx not in (0, 1):
            raise ValueError(f"unsupported skinny solidx: {solidx}")

        import aiter as ops

        # Allocate next to the input rather than on the current CUDA device.
        out = torch.empty(
            inp.shape[0], weights.shape[0], dtype=inp.dtype, device=inp.device
        )
        if solidx == 0:
            ops.wvSpltK(weights, inp, out, inp.shape[0], self.cu_count)
        else:
            ops.LLMM1(weights, inp, out, 4)
        if bias is not None:
            out += bias
        return out

    def apply_hipb_mm(
        self,
        inp,
        weights,
        solidx,
        bias=None,
        otype=None,
        scale_a=None,
        scale_b=None,
        scale_c=None,
        scale_type=None,
    ):
        """Run hipBLASLt solution ``solidx``; output dtype defaults to the input's."""
        if otype is None:
            otype = inp.dtype
        return hipb_mm(inp, weights.t(), solidx, bias, otype, scale_a, scale_b, scale_c, scale_type)

    def apply_rocb_mm(
        self,
        inp,
        weights,
        solidx,
        bias=None,
        otype=None,
        scale_a=None,
        scale_b=None,
        scale_c=None,
    ):
        """Run rocBLAS solution ``solidx``; scaling factors are not supported."""
        assert (
            scale_a is None and scale_b is None and scale_c is None
        ), "scale_a, scale_b, scale_c must be None for rocblas"
        out = rocb_mm(inp, weights.t(), solidx)
        if bias is not None:
            out = out + bias
        return out

    def apply_torch_mm(
        self,
        inp,
        weights,
        solidx,
        bias=None,
        otype=None,
        scale_a=None,
        scale_b=None,
        scale_c=None,
    ):
        """Fallback path through torch; also records untuned shapes if enabled."""
        if self.save_gemm == 1:
            m, k = inp.shape
            n = weights.shape[0]
            self.tuned_df = pd.concat(
                [
                    self.tuned_df,
                    pd.DataFrame(
                        {
                            "M": [m],
                            "N": [n],
                            "K": [k],
                            "bias": [bias is not None],
                            "dtype": [inp.dtype],
                            "outdtype": [otype],
                            "scaleAB": [scale_a is not None or scale_b is not None],
                        }
                    ),
                ]
            ).drop_duplicates()
            self.tuned_df.to_csv(self.untune_path, index=False)
        if inp.dtype == dtypes.fp8:
            # Missing scales default to 1.0.
            if scale_a is None:
                scale_a = torch.ones(1, dtype=dtypes.fp32, device=inp.device)
            if scale_b is None:
                scale_b = torch.ones(1, dtype=dtypes.fp32, device=inp.device)

            try:
                out = torch._scaled_mm(
                    inp,
                    weights.t(),
                    out_dtype=otype,
                    scale_a=scale_a,
                    scale_b=scale_b,
                    bias=bias,
                )
            except RuntimeError:
                # _scaled_mm rejected the call: emulate it in fp32.
                out = (
                    F.linear(inp.to(dtypes.fp32), weights.to(dtypes.fp32))
                    * scale_a
                    * scale_b
                )
                out = (out.to(otype) + bias) if bias is not None else out.to(otype)
            return out
        out = F.linear(inp, weights, bias)
        if otype is not None:
            out = out.to(otype)
        return out

    def scale_mm(
        self,
        inp,
        weights,
        bias=None,
        otype=None,
        scale_a=None,
        scale_b=None,
        scale_c=None,
        scale_type=None,
    ):
        """Scaled GEMM through hipBLASLt with its default solution (index -1).

        Raises:
            ValueError: if ``inp`` is not 2-dimensional.
        """
        # scale_type=0, scalar scale
        # scale_type=1, channel scale
        # scale_type=2, block scale
        # The previous `assert(False, "...")` asserted a non-empty tuple and
        # could never fail; non-2-D inputs only failed later on shape unpacking.
        if inp.dim() != 2:
            raise ValueError(
                f"not support {inp.dim()}dim input, scale_mm expects a 2-D tensor"
            )
        self._ensure_initialized(need_rocblas=False)
        out = self.solfuncs[1](
            inp, weights, -1, bias, otype, scale_a, scale_b, scale_c, scale_type
        )
        return out

    def mm(
        self,
        inp,
        weights,
        bias=None,
        otype=None,
        scale_a=None,
        scale_b=None,
        scale_c=None,
    ):
        """Compute ``inp @ weights.T (+ bias)`` with the tuned backend.

        Inputs with three or more dimensions are flattened to 2-D for the
        kernel and reshaped back; if they cannot be viewed as 2-D the call
        falls back to ``F.linear``.
        """
        # F.Linear can take a 3 dimensional input. vllm
        # uses this for linear units. However, sampler
        # will use torch.matmul with 2 dimensions only
        self._ensure_initialized(need_rocblas=True)
        if inp.dim() >= 3:
            try:
                inp_view = inp.view(-1, inp.size(-1))
                batched = True
            except RuntimeError:
                return F.linear(inp, weights, bias)
        else:
            inp_view = inp
            batched = False
        m, k = inp_view.shape
        n = weights.shape[0]
        use_bias = bias is not None
        soltype, solidx = self.query_sol(
            m=m,
            n=n,
            k=k,
            bias=use_bias,
            dtype=inp.dtype,
            otype=otype if otype is not None else inp.dtype,
            scaleAB=scale_a is not None or scale_b is not None,
        )
        out = self.solfuncs[soltype](
            inp_view, weights, solidx, bias, otype, scale_a, scale_b, scale_c
        )
        if batched:
            out = out.view(*inp.shape[:-1], weights.shape[0])
        return out
torch.float16 +bf16 = torch.bfloat16 +fp32 = torch.float32 +u32 = torch.uint32 +i32 = torch.int32 +i16 = torch.int16 +i8 = torch.int8 + +d_dtypes = { + "fp8": fp8, + "fp8_e8m0": fp8_e8m0, + "fp16": fp16, + "bf16": bf16, + "fp32": fp32, + "i4x2": i4x2, + "fp4x2": fp4x2, + "u32": u32, + "i32": i32, + "i16": i16, + "i8": i8, +} + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def str2tuple(v): + try: + parts = v.strip("()").split(",") + + return tuple(int(p.strip()) for p in parts) + except Exception as e: + raise argparse.ArgumentTypeError(f"invalid format of input: {v}") from e + + +def str2Dtype(v): + def _convert(s): + if s.lower() == "none": + return None + elif s in d_dtypes: + return d_dtypes[s] + else: + # Case-insensitive lookup for QuantType + s_lower = s.lower() + for name in dir(QuantType): + if not name.startswith("_") and name.lower() == s_lower: + return getattr(QuantType, name) + raise ValueError(f"'{s}' not in d_dtypes or QuantType") + + try: + parts = [p.strip() for p in v.strip("()").split(",") if p.strip()] + # Return single value if only one element and no comma; otherwise return tuple + if len(parts) == 1 and "," not in v: + return _convert(parts[0]) + return tuple(_convert(p) for p in parts) + except Exception as e: + raise argparse.ArgumentTypeError(f"invalid format of type: {v}") from e + + +def str2ActivationType(s): + """Convert string to ActivationType.""" + return getattr(ActivationType, s.capitalize()) diff --git a/aiter/utility/fp4_utils.py b/aiter/utility/fp4_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed6184b312ac8a9901b606752b5a7ec9608be4c --- /dev/null +++ b/aiter/utility/fp4_utils.py @@ -0,0 +1,342 @@ +# SPDX-License-Identifier: MIT +import torch +from . 
import dtypes +from torch import Tensor +import triton +import triton.language as tl + + +def fp32_to_fp4_e2m1fn_x2(x): + FP4_EBITS, FP4_MBITS = 2, 1 + x = _f32_to_floatx_unpacked(x.float(), FP4_EBITS, FP4_MBITS) + x = pack_uint4(x) + # x = x.view(dtypes.fp4x2) # to(fp32) for this datatype gives all 0 for torch... + x = x.view(torch.uint8) + return x + + +def down_size(size): + assert size[-1] % 2 == 0, f"{size} last dim not divisible by two" + return (*size[:-1], size[-1] // 2) + + +def pack_uint4(uint8_data) -> torch.Tensor: + # converting to uint8 for operations + shape = uint8_data.shape + assert shape[-1] % 2 == 0 + uint8_data = uint8_data.contiguous().view(-1) + return (uint8_data[1::2] << 4 | uint8_data[::2]).view(down_size(shape)) + + +# copy-pasted from +# https://github.com/pytorch/ao/blob/bc4f51da86956275da7db0da6e420c506df97820/torchao/prototype/custom_fp_utils.py#L27C1-L142C29 +def _n_ones(n: int) -> int: + return (1 << n) - 1 + + +EBITS_F32, MBITS_F32 = 8, 23 +F32_EXP_BIAS = _n_ones(EBITS_F32 - 1) + + +# copy-pasted from +# https://github.com/pytorch/ao/blob/bc4f51da86956275da7db0da6e420c506df97820/torchao/prototype/custom_fp_utils.py#L27C1-L142C29 +def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor: + """Convert FP32 numbers to sub-byte floating point numbers with the given + number of exponent and mantissa bits. + + Input: torch.Tensor of dtype torch.float + Output: torch.Tensor of dtype torch.uint8, where the bit encoding is stored + in the least significant bits. e.g. + fp4: bits 0-3 empty and bits 4-7 in fp4_e2m1 encoding + fp6: bits 0-1 empty and bits 2-7 in fp6_e2m3 or fp6_e3m2 encoding + + Note: there are no special values (NaN, inf) support in this code. Values + outside the representable range of Floatx after rounding are clamped to the + maximum Floatx magnitude (sign is preserved). 
+ + Code below is an adaptation of https://fburl.com/code/ciwofcg4 + + Background 1: last answer in https://stackoverflow.com/q/8981913 + Background 2: Computer Organization and Design, RISC-V edition, Chapter 3.5 + """ + assert x.dtype == torch.float + assert 1 + ebits + mbits <= 8 + + # calculate constants + exp_bias = _n_ones(ebits - 1) + max_int = _n_ones(ebits + mbits) + sign_mask = 1 << (ebits + mbits) + + # TODO document this better + magic_adder = _n_ones(MBITS_F32 - mbits - 1) + + # all E bits and M bits are 1s + max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2**mbits)) + + # E bits = 1, M bits = 0 + min_normal = 2 ** (1 - exp_bias) + + denorm_exp = ( + # exp bias conversion between formats + (F32_EXP_BIAS - exp_bias) + # mantissa length difference between formats + + (MBITS_F32 - mbits) + # add one to encoded exponent for denormalized numbers + + 1 + ) + denorm_mask_int = denorm_exp << MBITS_F32 + + # reinterpret int32 as float32 + denorm_mask_float = torch.tensor(denorm_mask_int, dtype=torch.int32).view( + torch.float32 + ) + + # save the sign + # Note that we have torch.uint32, but some ops like cpu bit shifts + # do not work on it. So, we stay in int32. + x = x.view(torch.int32) + sign = x & 0x80000000 + + # set everything to positive, will add sign back at the end + x = x ^ sign + + # TODO: can the branch floating point comparisons below be done without + # converting to float? 
probably but need to verify + x = x.view(torch.float) + + # rewrite saturate/denorm/norm branches without explicit data dependent + # control flow, to be more compiler friendly + saturate_mask = x >= max_normal + denormal_mask = torch.logical_and(torch.logical_not(saturate_mask), x < min_normal) + normal_mask = torch.logical_not(torch.logical_or(saturate_mask, denormal_mask)) + + # + # branch 1: saturate to max val - handled later in the code which combines + # the branches + # + + # + # branch 2: to conversion to denormal as well as rounding up to normal + # + denormal_x = x + denorm_mask_float + denormal_x = denormal_x.view(torch.int32) + denormal_x -= denorm_mask_int + denormal_x = denormal_x.to(torch.uint8) + + # + # branch 3: stay in normal range, adjust the exponent and round + # + normal_x = x.view(torch.int32) + # resulting mantissa is odd + mant_odd = (normal_x >> (MBITS_F32 - mbits)) & 1 + # update exponent, rounding bias part 1 + val_to_add = ((exp_bias - F32_EXP_BIAS) << MBITS_F32) + magic_adder + normal_x += val_to_add + # rounding bias part 2 + normal_x += mant_odd + # take the bits! + normal_x = normal_x >> (MBITS_F32 - mbits) + normal_x = normal_x.to(torch.uint8) + + # + # combine the branches + # + x = torch.full_like(x, max_int, dtype=torch.uint8) + x = torch.where(denormal_mask, denormal_x, x) + x = torch.where(normal_mask, normal_x, x) + + # add sign back + sign_lp = sign >> (MBITS_F32 + EBITS_F32 - mbits - ebits) + sign_lp = sign_lp.to(torch.uint8) + # Right shift of a negative signed integer can fill the least significant + # bits with either 1s or 0s, depending on the implementation. 
Since PyTorch + # doesn't have an uint32 dtype, we mask out these bits to get just the + # f4 sign bit + sign_lp = sign_lp & sign_mask + x = x | sign_lp + + return x.to(torch.uint8) + + +@triton.jit +def _dynamic_mxfp4_quant_kernel_asm_layout( + x_ptr, + x_fp4_ptr, + bs_ptr, + stride_x_m, + stride_x_n, + stride_x_fp4_m, + stride_x_fp4_n, + # stride_bs_m, + # stride_bs_n, + M: tl.constexpr, + N: tl.constexpr, + scaleN: tl.constexpr, + scaleM_pad: tl.constexpr, + scaleN_pad: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + MXFP4_QUANT_BLOCK_SIZE: tl.constexpr, + SCALING_MODE: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_n = tl.program_id(1) + + x_offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x_offs_n = pid_n * MXFP4_QUANT_BLOCK_SIZE + tl.arange(0, MXFP4_QUANT_BLOCK_SIZE) + x_offs = x_offs_m[:, None] * stride_x_m + x_offs_n[None, :] * stride_x_n + x_mask = (x_offs_m < M)[:, None] & (x_offs_n < N)[None, :] + x = tl.load(x_ptr + x_offs, mask=x_mask).to(tl.float32) + + # Calculate scale + amax = tl.max(tl.abs(x), axis=1, keep_dims=True) + amax = amax.to(tl.int32, bitcast=True) + amax = (amax + 0x200000).to(tl.uint32, bitcast=True) & 0xFF800000 + amax = amax.to(tl.float32, bitcast=True) + scale_e8m0_unbiased = tl.log2(amax).floor() - 2 + scale_e8m0_unbiased = tl.clamp(scale_e8m0_unbiased, min=-127, max=127) + quant_scale = tl.exp2(-scale_e8m0_unbiased) + + # Compute quantized x + qx = x * quant_scale + + # blockscale_e8m0 + bs_e8m0 = scale_e8m0_unbiased.to(tl.uint8) + 127 + + # Convert quantized fp32 tensor to uint32 before converting to mxfp4 format + # Note: MXFP4 S:1-bit, E:2-bit, M:1-bit + # Zeros: S000 -> +/-0 + # Denormal Numbers: S001 -> +/- 0.5 + # Normal Numbers: + # S010 -> +/- 1.0 + # S011 -> +/- 1.5 + # S100 -> +/- 2.0 + # S101 -> +/- 3.0 + # S110 -> +/- 4.0 + # S111 -> +/- 6.0 + qx = qx.to(tl.uint32, bitcast=True) + + # Extract sign, exponents and mantissa fields from FP32 + s = qx & 0x80000000 + e = (qx >> 23) & 0xFF + m = qx & 0x7FFFFF + + 
E8_BIAS: tl.constexpr = 127 + E2_BIAS: tl.constexpr = 1 + + # Denormal numbers + # If exponent is less than 127, then it's a denormal number + # See above, for denormal number mantissa is always 1 and we set bit 1 of mantissa + adjusted_exponents = tl.core.sub(E8_BIAS, e + 1, sanitize_overflow=False) + m = tl.where(e < E8_BIAS, (0x400000 | (m >> 1)) >> adjusted_exponents, m) + + # For normal numbers, bias is changed from 127 to 1, and for subnormals, we keep exponent as 0. + # Note: E8_BIAS - E2_BIAS = 126, so for normals we subtract that. + e = tl.maximum(e, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS) + + # Combine sign, exponent, and mantissa, while saturating + # rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right + e2m1_tmp = tl.minimum((((e << 2) | (m >> 21)) + 1) >> 1, 0x7) + e2m1_value = ((s >> 28) | e2m1_tmp).to(tl.uint8) + + e2m1_value = tl.reshape(e2m1_value, [BLOCK_SIZE, MXFP4_QUANT_BLOCK_SIZE // 2, 2]) + evens, odds = tl.split(e2m1_value) + out_tensor = evens | (odds << 4) + + out_offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + out_offs_n = pid_n * MXFP4_QUANT_BLOCK_SIZE // 2 + tl.arange( + 0, MXFP4_QUANT_BLOCK_SIZE // 2 + ) + out_offs = ( + out_offs_m[:, None] * stride_x_fp4_m + out_offs_n[None, :] * stride_x_fp4_n + ) + out_mask = (out_offs_m < M)[:, None] & (out_offs_n < (N // 2))[None, :] + tl.store(x_fp4_ptr + out_offs, out_tensor, mask=out_mask) + + bs_offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + bs_offs_n = pid_n + + bs_offs_0 = bs_offs_m[:, None] // 32 + bs_offs_1 = bs_offs_m[:, None] % 32 + bs_offs_2 = bs_offs_1 % 16 + bs_offs_1 = bs_offs_1 // 16 + bs_offs_3 = bs_offs_n[None, :] // 8 + bs_offs_4 = bs_offs_n[None, :] % 8 + bs_offs_5 = bs_offs_4 % 4 + bs_offs_4 = bs_offs_4 // 4 + bs_offs_6 = bs_offs_5 % 1 + bs_offs = ( + bs_offs_6 + + bs_offs_1 + + bs_offs_4 * 2 + + bs_offs_2 * 2 * 2 + + bs_offs_5 * 2 * 2 * 16 + + bs_offs_3 * 2 * 2 * 16 * 4 + + bs_offs_0 * 2 * 16 * scaleN + ) + 
def dynamic_mxfp4_quant(
    x: torch.Tensor, scaling_mode: str = "even"
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize a tensor to MX FP4 format.

    Args:
        x: The input tensor, typically fp16 or bf16. Must be 2-D with a last
            dimension divisible by 4.
        scaling_mode: The method to calculate MX block scaling.
            - "even" (default): `even_round` in `quark.torch.quantization.utils`.
            - etc.
            NOTE: this argument is currently not forwarded; the kernel is
            always launched with ``SCALING_MODE=0``.
    Returns:
        A tuple of (x_fp4, blockscale_e8m0).
        x_fp4 is a uint8 tensor of shape (M, N // 2) holding two FP4 values
        per byte. blockscale_e8m0 has shape (M rounded up to a multiple of
        32, number of 32-column blocks rounded up to a multiple of 8) and is
        returned viewed as ``dtypes.fp8_e8m0``.
    """
    # Assume x is 2D-Tensor for now
    M, N = x.shape

    # Two values are packed per output byte and the packed width must itself
    # be even. The earlier `(N // 2) % 2 == 0` check let odd N such as 5
    # through, which silently dropped the last column.
    assert N % 4 == 0, f"last dim of x must be divisible by 4, got {N}"

    # This is fixed by spec for MXFP4. Do not tune this.
    # For performance, perhaps, we should look at passing multiple of 32 column blocks
    # that a triton program can process
    MXFP4_QUANT_BLOCK_SIZE = 32

    x_fp4 = torch.empty((M, N // 2), dtype=torch.uint8, device=x.device)
    # The scale tensor is padded: rows to a multiple of 32, columns to a
    # multiple of 8.
    scaleM = (M + 31) // 32 * 32
    scaleN_valid = triton.cdiv(N, MXFP4_QUANT_BLOCK_SIZE)
    scaleN = triton.cdiv(scaleN_valid, 8) * 8
    blockscale_e8m0 = torch.empty(
        (
            scaleM,
            scaleN,
        ),
        dtype=torch.uint8,
        device=x.device,
    )

    BLOCK_SIZE = 128
    # One program per (block of rows, scale column), padded columns included.
    grid = (triton.cdiv(M, BLOCK_SIZE), scaleN)
    _dynamic_mxfp4_quant_kernel_asm_layout[grid](
        x,
        x_fp4,
        blockscale_e8m0,
        *x.stride(),
        *x_fp4.stride(),
        # *blockscale_e8m0.stride(),
        M=M,
        N=N,
        scaleN=scaleN_valid,
        scaleM_pad=scaleM,
        scaleN_pad=scaleN,
        BLOCK_SIZE=BLOCK_SIZE,
        MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE,
        SCALING_MODE=0,
    )

    return (x_fp4, blockscale_e8m0.view(dtypes.fp8_e8m0))
def work_group(gpuIDMap, fast_mode, err_ratio, in_data, tasks):
    """Run one task, or a group of tasks sharing inputs, on this process's GPU.

    Args:
        gpuIDMap: mapping from worker process id to GPU index.
        fast_mode: when falsy, a reference result is required for comparison.
        err_ratio: forwarded to ``worker`` as ``tol_err_ratio``.
        in_data: pair ``(kernels_num, input_data)``; ``input_data`` is used
            as-is when truthy, otherwise data is produced by ``gen_data``.
        tasks: a single task tuple, or a list of task tuples. Each tuple is
            ``(info, gen_data, gen_args, func, args, kwargs, ref_func,
            ref_args, ref_kwargs, ref, *rest)``.

    Returns:
        A list with one ``worker`` result per executed task.
    """
    group_task = [tasks] if not isinstance(tasks, list) else tasks
    kernels_num, (input_data) = in_data
    # The first task supplies the data generator and the reference settings
    # for the whole group.
    (
        info,
        gen_data,
        gen_args,
        func,
        args,
        kwargs,
        ref_func,
        ref_args,
        ref_kwargs,
        ref,
        *rest,
    ) = group_task[0]

    pid = mp.current_process().pid
    gpuID = gpuIDMap[pid]
    device = torch.device(f"cuda:{gpuID}")
    torch.cuda.set_device(device)
    # Generate inputs on this device only when none were passed in.
    data = (
        gen_data(*gen_args, device=device)
        if not input_data and gen_data is not None
        else input_data
    )

    assert ref_func is not None or ref is not None or fast_mode != 0
    # ref=None & ref_func=None & fast_mode=1: fast tune, not compare results, do not postprocess,return all results
    # ref=None & fast_mode=0: ref_func should be given and return best result
    # (ref!=None | ref_func!=None) & fast_mode=1: compare results and return all results, but do not postprocess
    # (ref!=None | ref_func!=None) & fast_mode=0: return best result, postprocess
    if ref is None and not fast_mode or (ref_func is not None and fast_mode):
        # When data exists, the first element of ref_args lists the indices
        # into data to pass to ref_func; the remaining elements are passed
        # through unchanged. Without data, all of ref_args is passed through.
        ref_data_idx, *rest = ([], *ref_args) if not data else ref_args
        updated_ref_args = tuple(data[i] for i in ref_data_idx) + tuple(rest)
        ref = ref_func(*updated_ref_args, **ref_kwargs)
        torch.cuda.synchronize()

    rets = []
    shape_grouped = isinstance(tasks, list)
    # A single (non-list) task runs once; a list runs kernels_num entries.
    solutions = 1 if not shape_grouped else kernels_num
    for i in range(solutions):
        (
            info,
            gen_data,
            gen_args,
            func,
            args,
            kwargs,
            ref_func,
            ref_args,
            ref_kwargs,
            ref_noused,
            *rest,
        ) = group_task[i]
        # either gen_data func or input data

        # With a generator, args[0] holds indices into the shared data and
        # args[1:] are literal extra arguments.
        new_args = (
            (tuple(data[i] for i in args[0]) + tuple(args[1:]))
            if gen_data is not None
            else args
        )

        # A reference stored on the task itself overrides the shared one
        # (and then remains in effect for the following tasks).
        ref = ref if ref_noused is None else ref_noused
        work_args = (
            info,
            func,
            new_args,
            kwargs,
            ref,
            *rest,
        )
        ret = worker(gpuIDMap, *work_args, tol_err_ratio=err_ratio)
        print(f"{ret}")
        rets.append(ret)
    return rets
not None or ref is not None or fast_mode != 0 + # ref=None & ref_func=None & fast_mode=1: fast tune, not compare results, do not postprocess,return all results + # ref=None & fast_mode=0: ref_func should be given and return best result + # (ref!=None | ref_func!=None) & fast_mode=1: compare results and return all results, but do not postprocess + # (ref!=None | ref_func!=None) & fast_mode=0: return best result, postprocess + if ref is None and not fast_mode or (ref_func is not None and fast_mode): + ref_data_idx, *rest = ([], *ref_args) if not data else ref_args + updated_ref_args = tuple(data[i] for i in ref_data_idx) + tuple(rest) + ref = ref_func(*updated_ref_args, **ref_kwargs) + torch.cuda.synchronize() + + rets = [] + shape_grouped = isinstance(tasks, list) + solutions = 1 if not shape_grouped else kernels_num + for i in range(solutions): + ( + info, + gen_data, + gen_args, + func, + args, + kwargs, + ref_func, + ref_args, + ref_kwargs, + ref_noused, + *rest, + ) = group_task[i] + # either gen_data func or inpur data + + new_args = ( + (tuple(data[i] for i in args[0]) + tuple(args[1:])) + if gen_data is not None + else args + ) + + ref = ref if ref_noused is None else ref_noused + work_args = ( + info, + func, + new_args, + kwargs, + ref, + *rest, + ) + ret = worker(gpuIDMap, *work_args, tol_err_ratio=err_ratio) + print(f"{ret}") + rets.append(ret) + return rets + + +def mp_tuner( + tasks, in_datas, mp_num=0, fast_mode=0, shape_grouped=False, err_ratio=0.05 +): + gpu_num = torch.cuda.device_count() + mp.set_start_method("spawn", force=True) + mp_num = gpu_num if mp_num < 1 or mp_num > gpu_num else mp_num + parallel_num = mp_num + start_idx = 0 + if mp_num == 1 & fast_mode == 0: + shape_grouped = True + pool = mp.Pool(processes=parallel_num) + + pids = [pool.apply_async(get_pid) for i in range(start_idx, mp_num)] + # time.sleep(2) + task_group = [] + # dispatch per shape to one pid + if not tasks: + return [] + if shape_grouped: + start = 0 + for kernel_nums, 
_ in in_datas: + end = start + kernel_nums - 1 + task_group.append(tasks[start : end + 1]) + start = end + 1 + else: + task_group = tasks + gpu_map = {el.get(): i + start_idx for i, el in enumerate(pids)} + # to get index of input data for task_group + import numpy as np + + # Calculate reference data index for each task group + ref_data_index = [i for i in range(len(in_datas))] + if not shape_grouped: + # For non-shape-grouped tasks, calculate cumulative sum of kernel numbers + cumulative = np.cumsum([size for size, _ in in_datas]) + # Find which input data each task group belongs to using binary search + ref_data_index = np.searchsorted( + cumulative, np.arange(len(task_group)), side="right" + ) + rets = [ + pool.apply_async( + work_group, + args=( + gpu_map, + fast_mode, + err_ratio, + in_datas[ref_data_index[k]], + task_group[k], + ), + ) + for k in range(len(task_group)) + ] + + pool.close() + pool.join() + + import itertools + + if shape_grouped: + result = list(itertools.chain.from_iterable(el.get() for el in rets)) + else: + result = [el.get()[0] for el in rets] + return result \ No newline at end of file diff --git a/aiter_logs/readme.md b/aiter_logs/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..9558fad2b15ccbdcc7c9acaa759b002c79e5b23a --- /dev/null +++ b/aiter_logs/readme.md @@ -0,0 +1 @@ +python3 ./aiter_logs/run.py ./aiter_logs/ ar_asm all \ No newline at end of file diff --git a/aiter_logs/run.py b/aiter_logs/run.py new file mode 100644 index 0000000000000000000000000000000000000000..e6a93e5d7d56874c27512b779e06630858fd2a9d --- /dev/null +++ b/aiter_logs/run.py @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: MIT +import json +import argparse +import os +import csv +import sys + + +def get_num_gpu(): + return 8 + + +def load_json_files(directory): + json_data = {} + for root, _, files in os.walk(directory): + for file in files: + if file.endswith('json'): + with open(directory+file, 'r') as f: + data = json.load(f) + 
# SPDX-License-Identifier: MIT
"""Extract GPU events from profiler trace JSON files into CSV/JSON summaries."""
import argparse
import csv
import json
import os

# Event categories that are collected as GPU work.
_GPU_EVENT_CATEGORIES = ('kernel', 'gpu_memcpy', 'gpu_user_annotation')
# Categories skipped silently; any other category is printed while scanning.
_IGNORED_CATEGORIES = ('ac2g', 'cuda_runtime', 'python_function',
                       'cpu_instant_event', 'cpu_op', 'user_annotation',
                       'Trace')
_CSV_FIELDNAMES = ['pid', 'dur', 'ts', 'min_dur', 'max_dur', 'min_start',
                   'max_start', 'latency_before_first_gpu',
                   'max_dur - min_dur', 'duration_from_last_arrival',
                   'first_gpu', 'last_gpu', 'shortest_gpu', 'longest_gpu']


def get_num_gpu():
    """Return the default number of events per summary group (8)."""
    return 8


def load_json_files(directory):
    """Load the ``traceEvents`` list of every JSON file under *directory*.

    The directory is walked recursively and every file whose name ends with
    ``json`` is read.

    Returns:
        ``{'traceEvents': [events_of_file_1, events_of_file_2, ...]}``, or an
        empty dict when no matching file exists.

    Raises:
        KeyError: If a loaded file has no ``traceEvents`` key.
    """
    json_data = {}
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('json'):
                continue
            # Join with the directory being walked: plain `directory + file`
            # breaks without a trailing separator and for nested files.
            with open(os.path.join(root, file), 'r') as f:
                data = json.load(f)
            json_data.setdefault('traceEvents', []).append(
                data['traceEvents'])
    return json_data


def _select_events(event_lists, function_name):
    """Return the GPU events whose name contains *function_name* ('all' = any).

    Categories that are neither collected nor known are printed.
    """
    selected = []
    for entries in event_lists:
        for entry in entries:
            category = entry.get('cat') if 'cat' in entry else None
            if 'name' in entry and 'cat' in entry \
                    and category in _GPU_EVENT_CATEGORIES:
                if function_name == 'all' or function_name in entry['name']:
                    selected.append(entry)
            elif 'cat' in entry and category not in _IGNORED_CATEGORIES:
                print(entry['cat'])
    return selected


def _new_group_stats():
    """Return fresh per-group accumulators.

    Infinite sentinels are used so that zero-valued durations and timestamps
    still register as extrema.
    """
    return {
        'min_dur': float('inf'), 'max_dur': float('-inf'),
        'min_start': float('inf'), 'max_start': float('-inf'),
        'duration_from_last_arrival': 0,
        'first_gpu': 0, 'last_gpu': 0, 'shortest_gpu': 0, 'longest_gpu': 0,
    }


def parse(json_file_path, output_file_name, function_name, num_gpu=None):
    """Write matching GPU events to ``<output>.csv`` and ``<output>.json``.

    Events are sorted by ``(ts, pid)``. The CSV holds one row per event and,
    after every *num_gpu* events, one summary row followed by a blank line;
    a trailing incomplete group gets no summary row. The JSON output holds
    the sorted events under ``traceEvents``, preceded by one empty object.

    Args:
        json_file_path: Directory searched by ``load_json_files``.
        output_file_name: Output path without extension.
        function_name: Substring of the event name to keep, or ``'all'``.
        num_gpu: Events per summary group; defaults to ``get_num_gpu()``.

    Returns:
        None. When nothing matches, a message is printed and no file is
        written.

    Raises:
        ValueError: If *num_gpu* is smaller than 1.
    """
    if num_gpu is None:
        num_gpu = get_num_gpu()
    if num_gpu < 1:
        raise ValueError('num_gpu must be at least 1')

    data_all = load_json_files(json_file_path)
    # An empty directory yields {}: treat it as "nothing found", not KeyError.
    kernels = _select_events(data_all.get('traceEvents', []), function_name)
    if not kernels:
        print('There is no ' + function_name + ' in this log')
        return

    sorted_kernels = sorted(kernels, key=lambda x: (x['ts'], x['pid']))

    csv_file_name = output_file_name + '.csv'
    json_file_out = output_file_name + '.json'

    # The leading empty object is kept for output compatibility.
    json_data_out = {'traceEvents': [{}]}

    with open(csv_file_name, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=_CSV_FIELDNAMES)
        writer.writeheader()
        stats = _new_group_stats()
        in_group = 0
        for entry in sorted_kernels:
            json_data_out['traceEvents'].append(entry)

            if entry['dur'] < stats['min_dur']:
                stats['min_dur'] = entry['dur']
                stats['shortest_gpu'] = entry['pid']
            if entry['dur'] > stats['max_dur']:
                stats['max_dur'] = entry['dur']
                stats['longest_gpu'] = entry['pid']
            if entry['ts'] < stats['min_start']:
                stats['min_start'] = entry['ts']
                stats['first_gpu'] = entry['pid']
            if entry['ts'] > stats['max_start']:
                stats['max_start'] = entry['ts']
                stats['duration_from_last_arrival'] = entry['dur']
                stats['last_gpu'] = entry['pid']

            writer.writerow({'pid': entry['pid'], 'dur': entry['dur'],
                             'ts': entry['ts']})
            in_group += 1
            if in_group == num_gpu:
                summary = dict(stats)
                summary['latency_before_first_gpu'] = (
                    stats['max_start'] - stats['min_start'])
                summary['max_dur - min_dur'] = (
                    stats['max_dur'] - stats['min_dur'])
                writer.writerow(summary)
                csvfile.write('\n')
                stats = _new_group_stats()
                in_group = 0

    with open(json_file_out, 'w') as jsonfileout:
        json.dump(json_data_out, jsonfileout, indent=4)

    print(f"Data successfully written to {csv_file_name} and {json_file_out}.")


def main():
    """Command-line entry point: parse the arguments and run ``parse``."""
    parser = argparse.ArgumentParser(
        description='Json file and the function to parse.')

    parser.add_argument('json_file_path', metavar='file_path',
                        type=str, help='Path to the JSON file to process')
    parser.add_argument('output_file_name', type=str, help='Output File Name')
    parser.add_argument('function_name', type=str,
                        help='Kernel Function Name, e.g., oneShotAllReduce, ncclDevKernel_Generic, mscclKernel')
    parser.add_argument('--num-gpu', type=int, default=None,
                        help='Events per summary group (default: 8)')

    args = parser.parse_args()
    parse(args.json_file_path, args.output_file_name, args.function_name,
          num_gpu=args.num_gpu)


if __name__ == '__main__':
    main()
0000000000000000000000000000000000000000..6c96f9b90fad80832ffe868911bf602a316fcac8 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,1908 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.globl Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + 
.args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 
100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 +.set vgprValuB_X0_I0, 244 +.set vgprValuB_X1_I0, 248 +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprMask1, 54 +.set vgprMask2, 55 +.set vgprBFtemp, 56 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set 
vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 //定义debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 +.set sgprLDSMask, 75 +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 +.set sgprWaveID, 88 +.set sgprLDSMask, 89 + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +.set sgprLocalWriteAddrScale, 74 +.set 
sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 + + + +.macro MMAC_32x32_0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.macro MMAC_32x32_1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: 
vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.set MT0, 8 +.set MT1, 32 + +.set LDS_B_OFFSET, 512 +.set LDS_BLK_OFFSET, 2560 +.set LDS_BLK_OFFSET_64Kmasked, 2560 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 32 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +//s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 3 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + 
+v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x10000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 
// s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 
exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? 
+s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer //计算每个 workgroup 的 debugbuffer 的地址偏移 +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v1=wg1*nwg0+wg0 +v_lshlrev_b32 v5, 0x8, v4 // v1 = v1 * 256 //这里是 thread 总数,随着wave数量自行更改 +v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] 
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 2 //x4 load +.set LOG2_COALESCE_THREAD_A, 1 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 + +// mcc +s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesA] // notice +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp1] + +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // + + +s_lshr_b32 s[sgprTemp1], s[sgprSizesSum], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprTemp1], s[sgprTemp1], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprGlobalReadOffsetB+1] +v_add_u32 v[vgprGlobalReadOffsetB+2], 16, v[vgprGlobalReadOffsetB+0] + + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], 
s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] + +v_add_u32 v[vgprGlobalReadOffsetB+2], 16, v[vgprGlobalReadOffsetB+0] +v_add_u32 v[vgprTemp0], 16, v[vgprTemp0] +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+3], 0x1, v[vgprGlobalReadOffsetB+3] + +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ /* Sets up operand A's LDS addressing: the write base of each global-load wave (GlWaveID times 0x4000), the wrap limit sgprLDSMask (that base plus 0x4000) and eight per-lane read addresses vgprLocalReadAddrA+0 through vgprLocalReadAddrA+7. */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +//s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +//s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +//s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +//s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] + +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], 0x4000 /* A write base: regions are 0x4000 apart per GlWaveID */ +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] /* remember the initial base; the prefetch loop resets the write pointer to it */ +// mcc +s_add_u32 s[sgprLDSMask], 0x4000, s[sgprLocalWriteAddrA] /* wrap limit that the prefetch loop compares the advancing write pointer against */ + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 /* Temp0 = vgprSerial & 63 */ +v_and_b32 v0, 15, v[vgprTemp0] /* v0 = Temp0 & 15 */ +v_lshrrev_b32 v1, 4, v[vgprTemp0] /* v1 = Temp0 >> 4 */ +v_mul_u32_u24 v2, 0x40, v1 /* v2 = v1 * 0x40 */ +v_add_u32 v[vgprLocalReadAddrA], v2, v0 /* NOTE(review): raw v0-v2 are used as scratch here instead of vgprTemp*; confirm they hold nothing live at this point */ +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 0x4000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] /* add this wave's region: WaveID * 0x4000 */ + +v_add_u32 v[vgprLocalReadAddrA+1], 0x10, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x20, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x30, v[vgprLocalReadAddrA] /* addresses 0-3 (base plus 0x00, 0x10, 0x20, 0x30) are the ones read by LDS_LOADAB */ + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x110, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x120, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x130, v[vgprLocalReadAddrA] /* addresses 4-7 (base plus 0x100, 0x110, 0x120, 0x130) are the ones read by LDS_LOADAB1 */ + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load /* NOTE(review): exact duplicate of the previous .set */ + +/******************************************/ +/* Generate LDS B parameters ... 
*/ +/******************************************/ /* Sets up operand B's LDS addressing: the write base of each global-load wave (LDS_B_OFFSET plus GlWaveID times 0x4000) and the per-lane read address. The scale and zero-point global read offsets follow. */ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*32 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +//s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad + +s_mov_b32 s[sgprLocalWriteAddrB], 0x0 +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 /* NOTE(review): this result is overwritten by the s_mul_i32 below before anything reads it */ +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET + +s_mul_i32 s[sgprTemp0], s[sgprGlWaveID], 0x4000 +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] /* B write base = LDS_B_OFFSET plus GlWaveID * 0x4000 */ +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] /* remember the initial base; the prefetch loop resets the write pointer to it */ + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] /* Temp1 = (Temp0 & 15) * 64 */ +v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] /* Temp2 = (Temp0 >> 4) << 3; kept in vgprTemp2 because indexing v[] with an sgpr* symbol would address an unrelated VGPR */ +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2] +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] +s_mul_i32 s[sgprTemp0], s[sgprGlWaveID], 0x4000 +v_add_u32 v[vgprLocalReadAddrB], s[sgprTemp0], v[vgprLocalReadAddrB] /* B read address = Temp1 plus Temp2 plus LDS_B_OFFSET plus GlWaveID * 0x4000 */ + + +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] /* (vgprSerial & 15) * 4 */ +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 // +s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] /* Temp1 = (SizesSum >> 7) * (StridesA << 2) */ + + +s_mul_i32 s[sgprTemp0], s[sgprWaveID], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] /* scale offset = (vgprSerial & 15) * 4 plus WaveID * Temp1 */ + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 // 
+s_mov_b32 s[sgprTemp1], s[sgprStridesA] +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] /* Temp1 = Temp2 * StridesA; Temp2 was loaded with SizesSum >> 7 immediately before this point */ + +s_mul_i32 s[sgprTemp0], s[sgprWaveID], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] /* zero-point offset gains WaveID * Temp1 */ + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp1], 8 // tlu=0, scaled tile-offset by stride /* NOTE(review): this takes the high half of (previous high word * 8); the carry out of the low-word multiply on the next instruction is not added. Harmless while WorkGroup0 * MT0 * 8 fits in 32 bits - confirm. */ +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride +/* Scale SRD: word 2 is ShadowLimitA >> 4 (BufferLimit when the shifted value needs more than 32 bits), words 0-1 are ScaleAddress plus Temp1:Temp0, word 3 is Srd127_96. */ +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 /* Zero SRD: same construction with the limit shifted by 6 and the tile offset Temp1:Temp0 shifted right by 2 */ +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp0:sgprTemp1], 2 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp3] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ +/* GLOBAL_LOADAB offset: issues three buffer loads carrying the lds modifier - one dwordx2 for A, then two dwordx4 for B. m0 is set to sgprLocalWriteAddrA or sgprLocalWriteAddrB plus the offset argument before the loads, and advanced by WAVE_LDS_OFFSET_B for the second B load. The B loads use idxen together with offen and take a VGPR pair as address operand. */ +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds +s_add_u32 m0, m0, WAVE_LDS_OFFSET_B +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2:vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds +.endm +/* GLOBAL_LOAD_Scale_Zero: loads one unsigned byte through the Zero descriptor into vgprValuZeros+0 and one dword through the Scale descriptor into vgprValuScales+0. No wait is issued here. */ +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm +/* GLOBAL_INC_Scale_Zero: advances the Scale descriptor base by GlobalReadIncsA >> 3 and the Zero descriptor base by GlobalReadIncsA >> 5, and rewrites each descriptor's limit word as (ShadowLimitA >> 4, respectively >> 6) minus that step, or BufferLimit when the result needs more than 32 bits. Clobbers sgprTemp0 through sgprTemp3. NOTE(review): the limit is derived from the current sgprShadowLimitA on every call rather than from the previous limit; confirm sgprShadowLimitA is kept up to date in the waves that invoke this macro. */ +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1] +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1] +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +.endm +/* UnPackB32ToTwoF32 vgprScale vgprOut: splits one 32-bit register into two. Out+0 receives the low 16 bits moved up to bits 31:16; Out+1 receives the high 16 bits with the low 16 bits cleared. NOTE(review): presumably the input holds two packed bf16 scales that are widened to f32 here - confirm. */ +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1] +.endm +/* UnPackB8To2F32 vgprZero vgprOut: Out+0 is the low nibble of the input and Out+1 is byte 0 of the input shifted right by 4, each converted to f32. */ +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm
+/* Fp32ToBf16Rne vgprVal: helper for I4ToFp16. Reduces the f32 in v[vgprVal] to its upper 16 bits in place (result in bits 15:0): bit 16 of the input and vgprMask2 are added before the shift right by 16, and vgprMask1 is substituted when the input compares unordered with itself. Clobbers vgprBFtemp and sgprTemp0:sgprTemp1; vgprMask1 and vgprMask2 must already be loaded by the caller. */ +.macro Fp32ToBf16Rne vgprVal:req +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[\vgprVal], v[\vgprVal] // check Nan +v_bfe_u32 v[vgprBFtemp], v[\vgprVal], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[\vgprVal], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[\vgprVal], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprVal], 16, v[\vgprVal] // convert C to bf16 +.endm
+/* I4ToFp16 vgprIn vgprZero vgprScale vgprPack: dequantises the four registers vgprIn+0 through vgprIn+3 (two 4-bit values each). Every input is split into low nibble and byte-0-shifted-right-by-4, converted to f32, run through a packed FMA (value times Scale plus Zero), reduced to 16 bits with Fp32ToBf16Rne and packed pairwise: the low-nibble results go to Pack+0 and Pack+1, the high-nibble results to Pack+2 and Pack+3. Scratch: vgprValuA_X0_H0+0 through vgprValuA_X0_H0+7, vgprBFtemp, sgprTemp0:sgprTemp1. NOTE(review): despite the macro name the 16-bit result is the upper half of the f32 (bf16 layout) packed with v_pack_b32_f16 - confirm that is intended. */ +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +Fp32ToBf16Rne vgprValuA_X0_H0+0 +Fp32ToBf16Rne vgprValuA_X0_H0+1 +Fp32ToBf16Rne vgprValuA_X0_H0+2 +Fp32ToBf16Rne vgprValuA_X0_H0+3 +Fp32ToBf16Rne vgprValuA_X0_H0+4 +Fp32ToBf16Rne vgprValuA_X0_H0+5 +Fp32ToBf16Rne vgprValuA_X0_H0+6 +Fp32ToBf16Rne vgprValuA_X0_H0+7 + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] +.endm
+ +/******************************************/ +/* Define Global Load address Increase... */ +/******************************************/ +/* GLOBAL_INC: steps both operand descriptors forward once. GlobalReadIncsA and GlobalReadIncsB are added to the 64-bit bases held in words 0-1 of SrdA and SrdB. For A the step is also subtracted from sgprShadowLimitA and SrdA word 2 is refreshed from it (BufferLimit while the remaining size needs more than 32 bits); the matching limit update for B is commented out. Clobbers sgprTemp0 and sgprTemp1. */ +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] +//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] +//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + + +.endm + +/******************************************/ +/* Define LDS Load... 
*/ +/******************************************/ +/* LDS_LOADAB off: issues six LDS reads at the given offset - four single-byte reads of A through vgprLocalReadAddrA+0 to vgprLocalReadAddrA+3 into vgprValuA_X0_I0+0 to vgprValuA_X0_I0+3, then two 64-bit reads of B through vgprLocalReadAddrB at off+0 and off+1024 into vgprValuB_X0_I0+0 to vgprValuB_X0_I0+3. No wait is issued inside the macro; call sites follow it with s_waitcnt lgkmcnt. */ +.macro LDS_LOADAB off:req + +//ds_read_m32x16_b16 v[vgprValuA_X0_I0+ 0:vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+0] offset:\off + +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off + +ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0 +ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+1024 +.endm +/* LDS_LOADAB1 off: same shape as LDS_LOADAB for the second register set - A is read through vgprLocalReadAddrA+4 to vgprLocalReadAddrA+7 into vgprValuA_X1_I0+0 to vgprValuA_X1_I0+3, and B is read at off+32 and off+1056 into vgprValuB_X1_I0+0 to vgprValuB_X1_I0+3. */ +.macro LDS_LOADAB1 off:req + +//ds_read_m32x16_b16 v[vgprValuA_X1_I0+ 0:vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off + +ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32 +ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+1056 +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +// mcc +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 7 + +s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cbranch_scc1 SkipGL + + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprLDSMask] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 
v[vgprDebugTmp], v[vgprGlobalReadOffsetB+3] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + + + +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_cmp_gt_u32 s[sgprTemp0], s[sgprLDSMask] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_cmp_gt_u32 s[sgprTemp0], s[sgprLDSMask] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + +s_cmp_lt_i32 s[sgprTemp3], 4 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(12) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_cmp_eq_i32 s[sgprTemp3], 2 +s_cbranch_scc1 Last2 +s_cmp_eq_i32 s[sgprTemp3], 3 +s_cbranch_scc1 Last3 +s_waitcnt vmcnt(9) +s_barrier +Last3: +s_waitcnt vmcnt(6) +s_barrier +Last2: +s_waitcnt vmcnt(3) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 32 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 8 +v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +//s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32 +//s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +//v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 + +GLOBAL_INC_Scale_Zero + +v_mov_b32 v[vgprMask1], 0x7fff0000 +v_mov_b32 v[vgprMask2], 0x7fff + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + +/* +.if debug_buffer + +v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrB+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X0_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif +*/ + + + + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 + + + +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 
v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword 
v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP +s_branch WaveID_EndSwitch +WaveID_gecase: + +WaveID_EndSwitch: + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1] +v_mov_b32 v[vgprLocalReadC], v[vgprLocalWriteC] +v_add_u32 v[vgprLocalReadC+1], 0x4000, v[vgprLocalReadC] +v_add_u32 v[vgprLocalReadC+2], 0x8000, v[vgprLocalReadC] +v_add_u32 v[vgprLocalReadC+3], 0xC000, v[vgprLocalReadC] + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_mul_u32_u24 v[vgprTemp1], 0x4000, v[vgprTemp1] +v_add_u32 v[vgprLocalWriteC], v[vgprLocalWriteC], v[vgprTemp1] + + + +s_barrier + +s_cmp_eq_u32 s[sgprWaveID], 0 +s_cbranch_scc0 Skip_Wave0 + +//ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +//ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +//ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +//ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 
v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[8], v[1] +v_mov_b32 v[12], v[5] + +//ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:0 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:0 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:0 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +//ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:256 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:256 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:256 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +//ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:512 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:512 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:512 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +//ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+13], 
v[vgprLocalReadC+1] offset:768 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:768 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:768 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave0: + + +s_cmp_eq_u32 s[sgprWaveID], 1 +s_cbranch_scc0 Skip_Wave1 + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +//ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +//ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +//ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +//ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[1], v[2] +v_mov_b32 v[5], v[6] +v_mov_b32 v[9], v[3] +v_mov_b32 v[13], v[7] + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:1024 +//ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:1024 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:1024 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:1024 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:1280 +//ds_read_b32 v[vgprTmpValC+5], 
v[vgprLocalReadC+1] offset:1280 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:1280 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:1280 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:1536 +//ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:1536 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:1536 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:1536 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:1792 +//ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:1792 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:1792 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:1792 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave1: + +s_cmp_eq_u32 s[sgprWaveID], 2 +s_cbranch_scc0 Skip_Wave2 + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +//ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +//ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +//ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +//ds_write_b32 
v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[2], v[8] +v_mov_b32 v[6], v[12] +v_mov_b32 v[10], v[9] +v_mov_b32 v[14], v[13] + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:2048 +//ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:2048 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:2048 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:2304 +//ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:2304 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:2304 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:2560 +//ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:2560 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:2560 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:2816 +//ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:2816 
+ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:2816 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave2: + +s_cmp_eq_u32 s[sgprWaveID], 3 +s_cbranch_scc0 Skip_Wave3 + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +//ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +//ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +//ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +//ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[3], v[10] +v_mov_b32 v[7], v[14] +v_mov_b32 v[11], v[11] +v_mov_b32 v[15], v[15] + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:3072 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:3072 +//ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:3072 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:3328 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:3328 
+//ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:3328 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:3584 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:3584 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:3584 +//ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:3584 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:3840 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:3840 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:3840 +//ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:3840 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave3: +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+4] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+8] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+12] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], 
v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + + + + + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase // NOTE(review): target label is the very next line, so taken and not-taken paths converge immediately - confirm no beta-specific path is missing +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 32 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 32*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +//s_mov_b32 s[sgprTemp1], s[sgprWaveID] +//s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +//v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store initial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store initial addr + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan
+v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] 
<- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 // store D + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..590afd0c509e3cb4078762770706602be576c326 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,2845 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +.globl Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 61440 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 
+amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 
+ .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 61440 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 + + +.set vgprValuB_X0_I0, 64 +.set vgprValuB_X1_I0, 80 + +.set vgprValuA_X0_H0, 96 +.set vgprValuA_X1_H0, 104 +.set vgprValuA_X2_I0, 112 +.set vgprValuA_X3_I0, 116 + + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + + +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprMask1, 252 +.set vgprMask2, 253 +.set vgprBFtemp, 254 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 
40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // defines the debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) - confirm the overlap is intended +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 // NOTE(review): redefined to 88 further down; presumably the later value is the one in effect - confirm +.set sgprLDSMask, 75 // NOTE(review): redefined to 89 further down; presumably the later value is the one in effect - confirm +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 // second definition (first was 64) +.set sgprLDSMask, 89 // second definition (first was 75) + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same index as sgprTemp4 +.set sgprStructBit, 97 // same index as sgprTemp5 +.set sgprStructNum, 98 // same index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // NOTE(review): same index as sgprSrdC (20) - confirm the overlap is intended + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] //
+v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_bf16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] 
v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // +s_setprio 0 +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: 
vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_bf16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] 
v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 128 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 10240 +.set LDS_BLK_OFFSET_64Kmasked, 10240 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set 
PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 128 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0xF000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = 
ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ 
+v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM 
+v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] // s78 = remainder when blockId >= numFullBlocks, else WGM +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute the debug-buffer address offset for each workgroup +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved in sgprWorkGroup0Ori +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; this is the total thread count, adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 // v6 = 2 * v5 +v_add_u32 v7, v6, v[vgprSerial] // v7 = v6 + serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = v7 * 0x40 (byte offset into the debug buffer) +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = upper dword of the debug buffer address +v_add_co_u32 v[vgprAddressDbg], vcc,
s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 
LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 15, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +//v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprGlobalReadOffsetB+1] +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprGlobalReadOffsetB+0] + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 
1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] + +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprTemp0+0] +v_mul_lo_u32 v[vgprGlobalReadOffsetB+3], s[sgprStrideStruct], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1], v[vgprGlobalReadOffsetB+3] + +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+3], 0x1, v[vgprGlobalReadOffsetB+3] +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 // NOTE(review): 64/COALESCE_THREAD_A*GLWAVES is 32 when COALESCE_THREAD_A=8 and GLWAVES=4, whose log2 is 5 - confirm this value +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds: LocalWriteAddrA = GlWaveID*WAVE_LDS_OFFSET_A, with (GlWaveID & 1)*4 placed in bits 16 and up +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] // keep a copy of the initial write address + +//get lds read addrA: (lane & 15) + (lane >> 4)*0x40 + WaveID*LDS_SUB_M_OFFSET, where lane = serial & 63 +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +/* +v_and_b32 v3, 1, v1 +v_mul_u32_u24 v3, 0x40, v3 +v_mul_u32_u24 v2, WAVE_LDS_OFFSET_A+0, v1 +v_add_u32 v2, v3, v2 +*/ +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//ADDR_WRAP: if \vaddr >= v[vgprTemp1] then \vaddr -= v[vgprTemp2] +//fixed input: \vaddr->address to wrap (updated in place); +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer; clobbers s[sgprTemp0:sgprTemp1] and v[vgprTemp3] +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm
+v_mov_b32 v[vgprTemp2], 0x200 // ADDR_WRAP reducer +v_mov_b32 v[vgprTemp1], 0x400 // ADDR_WRAP threshold +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 // ADDR_WRAP threshold +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*64 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds: LocalWriteAddrB = (GlWaveID*WAVE_LDS_OFFSET_B | (GlWaveID & 3) << 16) + LDS_B_OFFSET +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] + +// LDS read address B = (lane & 3)*64 + ((lane & 15) >> 2)*0x410 + (lane >> 4)*8 + LDS_B_OFFSET, where lane = serial & 63 +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] // (lane >> 4)*8, kept in the vgprTemp2 scratch register (an SGPR index symbol must not be used as a VGPR number) +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] +s_mov_b32 s[sgprTemp1], 0x410 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2] +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] // keep a copy of the initial read address + +v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB] +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // v4 = LDS_B_OFFSET + ((lane & 15) >> 2)*1024 + 1024
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 // wrap: address at or past v4? +v_mov_b32 v5, 1024 // +v_cndmask_b32 v5, 0, v5, s[80:81] // v5 = 1024 where the compare was true, else 0 +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] // +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // v4 = LDS_B_OFFSET + ((lane & 15) >> 2)*1024 + 1024 +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 // wrap: address at or past v4? +v_mov_b32 v5, 1024 +v_cndmask_b32 v5, 0, v5, s[80:81] // v5 = 1024 where the compare was true, else 0 +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 4 // ShadowLimitA += WorkGroup0*MT0*2 only when WaveID < 4 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + + + +// scale read offset = (serial & 15)*4 + WaveID*64 +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +// zero read offset = (serial & 15) + WaveID*16 +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride: high dword is the carry out of (low dword * 8), so it must read Temp0 before Temp0 is overwritten +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32
s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +// GLOBAL_LOADAB: sets m0 to the A, then the B, LDS write address plus \offset and issues the buffer loads that carry the lds modifier. +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds +s_add_u32 m0, m0, WAVE_LDS_OFFSET_B*4 // second B load writes WAVE_LDS_OFFSET_B*4 further on +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2:vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds +.endm + +// GLOBAL_LOAD_Scale_Zero: loads one byte through the Zero SRD into ValuZeros and one dword through the Scale SRD into ValuScales. +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +// GLOBAL_INC_Scale_Zero: advances the Scale SRD base by GlobalReadIncsA >> 3 and the Zero SRD base by GlobalReadIncsA >> 5, +// and reduces each SRD's limit word (+2) by the same amount, clamping it to 0. Clobbers s[sgprTemp0], s[sgprTemp1], s[sgprTemp2]. +.macro GLOBAL_INC_Scale_Zero + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // limit < increment ? +s_cmov_b32 s[sgprScale+2], 0 // then clamp the limit to 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // limit >= increment ? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // then limit -= increment + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // limit < increment ? +s_cmov_b32 s[sgprZero+2], 0 // then clamp the limit to 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // limit >= increment ?
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1] +.endm + +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 
v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+0], 16, v[vgprValuA_X0_H0+0] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 16, v[vgprValuA_X0_H0+1] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+2], 16, v[vgprValuA_X0_H0+2] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask1], 
s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 16, v[vgprValuA_X0_H0+3] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[vgprValuA_X0_H0+4] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 16, v[vgprValuA_X0_H0+5] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+6], 16, v[vgprValuA_X0_H0+6] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // 
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 16, v[vgprValuA_X0_H0+7] // convert C to bf16
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2]
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7]
+.endm
+
+
+/******************************************/
+/* Define Global Load address Increase... */
+/******************************************/
+
+// GLOBAL_INC: step the A and B global-read state forward by one increment.
+//
+// Takes no arguments. Overwrites sgprTemp0, sgprTemp1 and SCC.
+//
+// A side: the 64-bit value in SrdA+0/SrdA+1 is increased by
+//         GlobalReadIncsA+0, the 64-bit ShadowLimitA is decreased by the
+//         same amount, and SrdA+2 is then refreshed from ShadowLimitA+0
+//         (or set to BufferLimit while the high word of the shadow limit
+//         is still non-zero).
+// B side: only SrdB+0/SrdB+1 are advanced. The shadow-limit update is
+//         commented out, so this macro leaves SrdB+2 unchanged.
+//
+// NOTE(review): SrdA/SrdB are presumably the buffer resource descriptors
+// used by the global loads (words 0-1 = base address, word 2 = limit) --
+// confirm against GLOBAL_LOADAB.
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] // low word of the A increment
+s_mov_b32 s[sgprTemp1], 0 // high word of the increment is zero
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] // 64-bit add, low word (sets carry in SCC)
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] // 64-bit add, high word (consumes carry)
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] // 64-bit subtract, low word (sets borrow in SCC)
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] // 64-bit subtract, high word (consumes borrow)
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] // low word of the B increment
+s_mov_b32 s[sgprTemp1], 0 // high word of the increment is zero
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] // 64-bit add, low word (sets carry in SCC)
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // 64-bit add, high word (consumes carry)
+// NOTE(review): the B shadow-limit bookkeeping below is disabled, unlike the
+// A side above; whether that is intentional is not visible here -- confirm.
+//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0]
+//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1]
+//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
+
+
+.endm
+
+/******************************************/
+/* Define LDS Load...
*/
+/******************************************/
+
+// LDS_LOADAB off: issue the LDS reads for the "X0" operand set.
+//
+//   off (required) - added to the offset field of every read; callers in
+//                    this file pass LDS_BLK_OFFSET*k.
+//
+// A: four ds_read_u8, one per address register vgprLocalReadAddrA+0..3,
+//    into vgprValuA_X0_I0+0..3.
+// B: eight ds_read_b64 into vgprValuB_X0_I0+0..15, as two groups of four;
+//    the second group repeats the first with 4096 added to the offset.
+//    In each group three reads use vgprLocalReadAddrB at +0/+256/+512 and
+//    the fourth uses vgprLocalReadAddrB+1 at +0.
+//
+// Twelve DS reads are issued and the macro contains no s_waitcnt: the
+// caller must wait on lgkmcnt before consuming the destination registers.
+.macro LDS_LOADAB off:req
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256
+ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512
+ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0
+
+ds_read_b64 v[vgprValuB_X0_I0+ 8:vgprValuB_X0_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4096
+ds_read_b64 v[vgprValuB_X0_I0+ 10:vgprValuB_X0_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4352
+ds_read_b64 v[vgprValuB_X0_I0+ 12:vgprValuB_X0_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4608
+ds_read_b64 v[vgprValuB_X0_I0+ 14:vgprValuB_X0_I0+ 15], v[vgprLocalReadAddrB+1] offset:\off+4096
+.endm
+
+// LDS_LOADAB1 off: same read pattern as LDS_LOADAB, for the "X1" operand set.
+//
+//   off (required) - added to the offset field of every read.
+//
+// A: four ds_read_u8 from vgprLocalReadAddrA+4..7 into vgprValuA_X1_I0+0..3.
+// B: eight ds_read_b64 into vgprValuB_X1_I0+0..15. The reads addressed by
+//    vgprLocalReadAddrB use LDS_LOADAB's offsets plus 32 (32/288/544 and
+//    4128/4384/4640); the fourth read of each group uses
+//    vgprLocalReadAddrB+2 at +0 / +4096 (no +32).
+//
+// Twelve DS reads are issued; no s_waitcnt inside, the caller waits.
+.macro LDS_LOADAB1 off:req
+
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288
+ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544
+ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0
+
+ds_read_b64 v[vgprValuB_X1_I0+ 8:vgprValuB_X1_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4128
+ds_read_b64 v[vgprValuB_X1_I0+ 10:vgprValuB_X1_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4384
+ds_read_b64 v[vgprValuB_X1_I0+ 12:vgprValuB_X1_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4640
+ds_read_b64 v[vgprValuB_X1_I0+ 14:vgprValuB_X1_I0+ 15], v[vgprLocalReadAddrB+2] offset:\off+4096
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ... */
+/******************************************/
+
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 // SizesSum / 32
+// NOTE(review): the s_min_u32 result in sgprLoopCntCommon is overwritten by
+// the s_add_u32 two instructions below, and s_and_b32 rewrites SCC in
+// between, so this instruction appears to have no effect -- confirm before
+// removing.
+s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL]
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 // SCC = 1 when SizesSum is not a multiple of 32
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc // round the iteration count up
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+s_cmp_lt_i32 s[sgprWaveID], 4
+s_cbranch_scc1 SkipGL // waves with WaveID < 4 do not run the global-load path below
+s_cmp_eq_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 SkipToLastLoad // a single iteration: only the final load is needed
+s_mov_b32 s[sgprTemp3], 0 // Temp3 counts the loads issued so far
+PreFetchBegin:
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+GLOBAL_INC
+
+// Advance the A write address and wrap it back to its original value once
+// it reaches LocalWriteAddrAori + LDSMask.
+s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+
+
+// Same advance-and-wrap for the B write address.
+s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+
+s_cmp_lt_i32 s[sgprTemp3], 4
+s_cbranch_scc1 SkipWait // no wait/barrier while Temp3 < 4
+s_waitcnt vmcnt(12)
+s_barrier
+SkipWait:
+s_add_u32 s[sgprTemp3], s[sgprTemp3], 1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 PreFetchBegin
+// Drain: enter the wait/barrier ladder at the rung selected by Temp3 and
+// fall through the remaining rungs down to vmcnt(0).
+s_cmp_eq_i32 s[sgprTemp3], 1
+s_cbranch_scc1 Last1
+s_cmp_eq_i32 s[sgprTemp3], 2
+s_cbranch_scc1 Last2
+s_cmp_eq_i32 s[sgprTemp3], 3
+s_cbranch_scc1 Last3
+s_waitcnt vmcnt(9)
+s_barrier
+Last3:
+s_waitcnt vmcnt(6)
+s_barrier
+Last2:
+s_waitcnt vmcnt(3)
+s_barrier
+Last1:
+s_waitcnt vmcnt(0)
+s_barrier
+s_barrier
+
+SkipToLastLoad:
+s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... */ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 +v_mov_b32 v[vgprValuC+16], 0x0 +v_mov_b32 v[vgprValuC+17], 0x0 +v_mov_b32 v[vgprValuC+18], 0x0 +v_mov_b32 v[vgprValuC+19], 0x0 +v_mov_b32 v[vgprValuC+20], 0x0 +v_mov_b32 v[vgprValuC+21], 0x0 +v_mov_b32 v[vgprValuC+22], 0x0 +v_mov_b32 v[vgprValuC+23], 0x0 +v_mov_b32 v[vgprValuC+24], 0x0 +v_mov_b32 v[vgprValuC+25], 0x0 +v_mov_b32 v[vgprValuC+26], 0x0 +v_mov_b32 v[vgprValuC+27], 0x0 +v_mov_b32 v[vgprValuC+28], 0x0 +v_mov_b32 v[vgprValuC+29], 0x0 +v_mov_b32 v[vgprValuC+30], 0x0 +v_mov_b32 v[vgprValuC+31], 0x0 +v_mov_b32 v[vgprValuC+32], 0x0 +v_mov_b32 v[vgprValuC+33], 0x0 +v_mov_b32 v[vgprValuC+34], 0x0 +v_mov_b32 v[vgprValuC+35], 0x0 +v_mov_b32 v[vgprValuC+36], 0x0 +v_mov_b32 v[vgprValuC+37], 0x0 +v_mov_b32 v[vgprValuC+38], 0x0 +v_mov_b32 v[vgprValuC+39], 0x0 +v_mov_b32 v[vgprValuC+40], 0x0 +v_mov_b32 v[vgprValuC+41], 0x0 +v_mov_b32 v[vgprValuC+42], 0x0 +v_mov_b32 v[vgprValuC+43], 0x0 +v_mov_b32 v[vgprValuC+44], 0x0 +v_mov_b32 v[vgprValuC+45], 0x0 +v_mov_b32 v[vgprValuC+46], 0x0 +v_mov_b32 v[vgprValuC+47], 0x0 +v_mov_b32 v[vgprValuC+48], 0x0 +v_mov_b32 v[vgprValuC+49], 0x0 +v_mov_b32 v[vgprValuC+50], 0x0 +v_mov_b32 v[vgprValuC+51], 0x0 +v_mov_b32 v[vgprValuC+52], 0x0 +v_mov_b32 v[vgprValuC+53], 0x0 +v_mov_b32 v[vgprValuC+54], 0x0 +v_mov_b32 v[vgprValuC+55], 0x0 +v_mov_b32 v[vgprValuC+56], 0x0 +v_mov_b32 v[vgprValuC+57], 0x0 +v_mov_b32 v[vgprValuC+58], 0x0 +v_mov_b32 v[vgprValuC+59], 0x0 +v_mov_b32 
v[vgprValuC+60], 0x0 +v_mov_b32 v[vgprValuC+61], 0x0 +v_mov_b32 v[vgprValuC+62], 0x0 +v_mov_b32 v[vgprValuC+63], 0x0 + +GLOBAL_INC_Scale_Zero + +v_mov_b32 v[vgprMask1], 0x7fff0000 +v_mov_b32 v[vgprMask2], 0x7fff + + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 + + + +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 + +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... */ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) + + +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + 
+GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], 
v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP +s_branch WaveID_EndSwitch +WaveID_gecase: + +WaveID_EndSwitch: + +/******************************************/ +/* Tail Loop Process ... 
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_mov_b32 s[sgprTemp1], s[sgprWaveID] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], 
v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 
+s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], 
v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] 
+v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of 
bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] 
//v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 
2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // 
Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // 
store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] 
// check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] 
//v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], 
v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 
+buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 
v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 
s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, 
v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], 
v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], 
v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], 
-1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + 
+s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + + + + + + + + + +.set Nvoff, 32 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding 
+v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 33 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] 
+v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 34 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 
v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 35 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for 
compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], 
s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 40 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 
v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 41 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, 
v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 42 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], 
v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 43 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- 
v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen 
offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 48 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], 
v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 49 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short 
v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 50 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], 
v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], 
s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 51 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, 
v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 56 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], 
v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 57 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], 
v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 58 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + 
+s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 59 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 
v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s new file mode 100644 index 0000000000000000000000000000000000000000..792bcbfd1901f03a50bb0f8b27c3b00a42ee36ec --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s @@ -0,0 +1,2741 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.globl Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.p2align 8 +.type Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + 
.amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + 
.value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +.set vgprValuB_X0_I0, 64 +.set vgprValuB_X1_I0, 80 + +.set vgprValuA_X0_H0, 96 +.set vgprValuA_X1_H0, 104 +.set vgprValuA_X2_I0, 112 +.set vgprValuA_X3_I0, 116 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprMask1, 252 +.set vgprMask2, 253 +.set vgprBFtemp, 254 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 211 +.set vgprGlobalReadOffsetB, 212 +.set vgprGlobalReadOffsetB1, 216 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 
+.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address (loaded as a 64-bit pair by the prologue) + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) -- confirm the overlap is intended +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprLDSMask, 75 // NOTE(review): sgprLDSMask is defined again as 89 further down +//.set sgprLoopforPfIter, 76 +//.set sgprLDSWriteIter, 78 + +.set sgprLocalWriteAddrA1, 76 +.set sgprLocalWriteAddrB1, 77 +.set sgprLocalWriteAddrA1ori, 78 +.set sgprLocalWriteAddrB1ori, 79 + + + +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA (84) +.set sgprWaveID, 88 +.set sgprLDSMask, 89 // second definition of sgprLDSMask (75 above) + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // 96..98 reuse the sgprTemp4..sgprTemp6 indices +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 // same index as the first sgprLDSMask definition (75) +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // NOTE(review): same index as sgprSrdC (20) -- confirm the overlap is intended + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 
v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_bf16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] 
v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // + +s_setprio 0 +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 
+v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_bf16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] 
v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // + 
+s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 128 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 10240 +.set LDS_BLK_OFFSET_64Kmasked, 10240 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Advance the kernarg pointer by 16 bytes, past the common args read above +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 // propagate the carry into the high dword +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // s[24:39]: sgprSizesFree (24) through sgprStridesC (38) +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // s[40:43]: sgprStridesA (40), sgprStridesB (42) +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // s[44:45]: sgprAlpha, sgprBeta +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc: the next two shifts halve SizesFree0 and StridesA (NOTE(review): rationale is not visible here -- confirm) +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x7800 + + +v_mov_b32 v8, MT0 // v8 = MT0 (held in a VGPR, not an SGPR) +v_mov_b32 v7, s[sgprSizesFree+0] // v7 = SizesFree0 (already halved above) +v_cvt_f32_u32 v6, v8 // v6 = float(v8); start of v6 = ceil(v7 / v8) 
+v_rcp_iflag_f32 v6, v6 // v6 = 1 / float(v8) +v_cvt_f32_u32 v9, v7 // v9 = float(v7) +v_mul_f32 v6, v6, v9 // v6 = float estimate of v7 / v8 +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate q as u32 +v_mul_u32_u24 v9, v6, v8 // v9 = q * v8 +v_sub_u32 v9, v7, v9 // v9 = v7 - q * v8 (remainder) +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // v6 = q plus carry-in, giving ceil(v7 / v8) +v_mov_b32 v8, MT1 // v8 = MT1 (held in a VGPR, not an SGPR) +v_mov_b32 v7, s[sgprSizesFree+1] // v7 = SizesFree1 +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // NumWorkGroups0 = ceil(SizesFree0 / MT0) from the sequence above +v_cvt_f32_u32 v6, v8 // v6 = float(v8); start of v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = 1 / float(v8) +v_cvt_f32_u32 v9, v7 // v9 = float(v7) +v_mul_f32 v6, v6, v9 // v6 = float estimate of v7 / v8 +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate q as u32 +v_mul_u32_u24 v9, v6, v8 // v9 = q * v8 +v_sub_u32 v9, v7, v9 // v9 = v7 - q * v8 (remainder) +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // v6 = q plus carry-in, giving ceil(v7 / v8) +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // NumWorkGroups1 = ceil(SizesFree1 / MT1) + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU (low 14 bits of sgprGSU) +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // divisor s78 = numWG0 * numWG1 * GSU; computing s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // v6 = 1 / float(s78) +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // v7 = float(WorkGroup0) +v_mul_f32 v6, v6, v7 // v6 = float estimate of the quotient +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate q as u32 +v_mul_u32_u24 v7, v6, s78 // v7 = q * s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = WorkGroup0 - q * s78 +v_cmpx_eq_u32 exec, v7, s78 // keep only lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // q += 1 in those lanes +s_mov_b64 exec, -1 // re-enable all lanes +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, 
s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // computing s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0]; v6 = float(divisor) +v_rcp_iflag_f32 v6, v6 // v6 = 1 / float(NumWorkGroups0) +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // v7 = float(WorkGroup0) +v_mul_f32 v6, v6, v7 // v6 = float estimate of the quotient +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate q as u32 +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // v7 = q * NumWorkGroups0 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = WorkGroup0 - q * NumWorkGroups0 +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // keep only lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // q += 1 in those lanes +s_mov_b64 exec, -1 // re-enable all lanes +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM: computing s76 = WorkGroup1 / WGM; v6 = float(WGM) +v_rcp_iflag_f32 v6, v6 // WGM: v6 = 1 / float(WGM) +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM: v7 = float(WorkGroup1) +v_mul_f32 v6, v6, v7 // WGM: float estimate of the quotient +v_cvt_u32_f32 v6, v6 // WGM: quotient estimate q as u32 +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM: v7 = q * WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM: v7 = WorkGroup1 - q * WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM: keep only lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // WGM: q += 1 in those lanes +s_mov_b64 exec, -1 // WGM: re-enable all lanes +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 
+v_cvt_f32_u32 v6, s[sgprWGM] // WGM: computing s78 = NumWorkGroups1 / WGM; v6 = float(WGM) +v_rcp_iflag_f32 v6, v6 // WGM: v6 = 1 / float(WGM) +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM: v7 = float(NumWorkGroups1) +v_mul_f32 v6, v6, v7 // WGM: float estimate of the quotient +v_cvt_u32_f32 v6, v6 // WGM: quotient estimate q as u32 +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM: v7 = q * WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM: v7 = NumWorkGroups1 - q * WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM: keep only lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // WGM: q += 1 in those lanes +s_mov_b64 exec, -1 // WGM: re-enable all lanes +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // computing s[sgprWorkGroup0] = s77 / s78; v6 = float(s78) +v_rcp_iflag_f32 v6, v6 // v6 = 1 / float(s78) +v_cvt_f32_u32 v7, s77 // v7 = float(s77) +v_mul_f32 v6, v6, v7 // v6 = float estimate of the quotient +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate q as u32 +v_mul_u32_u24 v7, v6, s78 // v7 = q * s78 +v_sub_u32 v7, s77, v7 // v7 = s77 - q * s78 +v_cmpx_eq_u32 exec, v7, s78 // keep only lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // q += 1 in those lanes +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // re-enable all lanes +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder (overwritten by the next two scalar instructions) +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved before the remap +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; this is the total thread count -- adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 
+v_add_u32 v7, v6, v[vgprSerial] // v7 = 512 * workgroup id plus the thread serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = v7 * 64, a byte offset giving 64 bytes per thread +v_mov_b32 v2, 0 // zero operand for the carry add below +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high dword of the debug buffer address +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // low dword: debug buffer address plus the byte offset +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // high dword plus the carry from the low add +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store: thread serial +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store: remapped WorkGroup0 +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store: remapped WorkGroup1 +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane = serial & 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] // v1 = lane >> 3 +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] // v0 = lane & 7 +v_mul_lo_u32 v0, 8, v0 // v0 = (lane & 7) * 8 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE // StridesA * GLWAVES * BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] // v1 = (lane >> 3) * StridesA * GLWAVES * BPE +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] // StridesA * BPE * GlWaveID +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] // add the per-wave term + + +//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice: SizesSum * StridesA, with StridesA already halved in the prologue +v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1] // second A read offset = first offset plus that product + + + + + + + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Convert the 64-bit limit from elements to bytes (shift left by LOG2BPE) +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32, else BufferLimit (NOTE(review): BufferLimit is .set to 0 in this file -- confirm) +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG (high 32 bits) +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG (low 32 bits) +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // add sgprTemp0 (WorkGroup0 * MT0, computed above) +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // carry into the high dword +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // multiply the 64-bit offset by 2 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] // SRD base low = AddressA low plus offset +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] // SRD base high, with carry +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // low bit of SizesFree0 +s_cmp_eq_u32 s[sgprTemp1], 0 // even: skip the limit extension below +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] // DEPTHU * BPE * StridesA + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] // serial & 255 +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] // ... >> 6 +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] // ... * 4 +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] // lane = serial & 63 +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] // (lane >> 4) * 16 +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 15, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1] // (serial & 15) >> 2 +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] // vgprTemp0 = sum of the three terms above +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] // vgprTemp1 = (serial & 3) * 8 +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd // StridesB >> 13, used below as a shift amount +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // WorkGroup1 * MT1 (original note: L624) +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] // SizesFree1 - WorkGroup1 * MT1 +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // ... minus 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // copy that bound to a VGPR +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] // clamp vgprTemp0 to the bound +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // vgprTemp0 << (StridesB >> 13) +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprGlobalReadOffsetB+1] +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprGlobalReadOffsetB+0] + + + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] +v_mov_b32 v[vgprGlobalReadOffsetB1+2], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB1+3], s[sgprSizesSum], v[vgprGlobalReadOffsetB+3] + + + + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 
+s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 // sgprTemp7 becomes 1 when SizesSum is exactly 4096, 2048, 1024 or 512 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] // reached only when SizesSum > 512 and sgprTemp7 == 0 +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] + + +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprTemp0+0] +v_mul_lo_u32 v[vgprGlobalReadOffsetB+3], s[sgprStrideStruct], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1], v[vgprGlobalReadOffsetB+3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+3], 0x1, v[vgprGlobalReadOffsetB+3] + + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + +v_mov_b32 v[vgprGlobalReadOffsetB1+2], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB1+3], s[sgprSizesSum], v[vgprGlobalReadOffsetB+3] + + + +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], 
s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Convert the 64-bit limit from elements to bytes (shift left by LOG2BPE) +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG (high 32 bits) +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG (low 32 bits) +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // SRD base high dword, with carry +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? 
+s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // Temp0 = StridesB >> 13 +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // Temp1 = StridesB * 2 +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // Temp2 = Temp1 >> Temp0 +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // move that value to bits 16 and up +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // set bit 30 (NOTE(review): presumably the stride field and a mode bit of SRD word 1 -- confirm against the ISA) + +s_cmp_eq_u32 s[sgprTemp7], 1 // sgprTemp7 == 1: keep the value computed above +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // SizesSum <= 512: keep it as well +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // 1 << StructBit +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // set bit 30 +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // replaces the value computed above +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer: OR the bits into SRD B word 1 + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + +/******************************************/ +/* Generate LDS A parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] // OR (GlWaveID & 1) * 4, shifted to bits 16 and up, into the write address +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//ADDR_WRAP inputs: \vaddr = address register, updated in place; +// v[vgprTemp1] = threshold; v[vgprTemp2] = amount subtracted when \vaddr >= threshold; writes s[sgprTemp0:sgprTemp1] and v[vgprTemp3] +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], 
s[sgprTemp0:sgprTemp1] +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm +v_mov_b32 v[vgprTemp2], 0x200 +v_mov_b32 v[vgprTemp1], 0x400 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load; NOTE(review): this statement appears twice in a row +.set WAVE_LDS_OFFSET, UNDEF //x4 load (verbatim repeat of the previous statement) + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*64 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] +v_lshlrev_b32 v[sgprTemp2], 3, v[vgprTemp2] // NOTE(review): the destination is a VGPR indexed by the SGPR alias sgprTemp2 (82), i.e. v82, inside the vgprValuB_X1_I0 range that starts at 80 -- confirm vgprTemp2 was not intended +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] +s_mov_b32 s[sgprTemp1], 0x410 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[sgprTemp2] // reads the same v82 written above +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] + 
+// LocalReadAddrB+1 = LocalReadAddrB + 768, wrapped back by 1024 once it reaches
+// LDS_B_OFFSET + (((Serial & 15) >> 2) << 10) + 1024.
+// NOTE(review): s[80:81] is a hard-coded SGPR pair used as the compare mask; confirm it does not
+// alias a live named SGPR.
+v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB]
+v_and_b32 v0, 63, v[vgprSerial] //
+v_and_b32 v1, 15, v0 //
+v_lshrrev_b32 v2, 2, v1 //
+v_lshlrev_b32 v3, 10, v2 //
+v_add_u32 v3, LDS_B_OFFSET, v3 //
+v_add_u32 v4, 1024, v3 //
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4
+v_mov_b32 v5, 1024 //
+v_cndmask_b32 v5, 0, v5, s[80:81]
+v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 //
+
+// LocalReadAddrB+2 = LocalReadAddrB + 800, wrapped with the same limit and step as above.
+v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] //
+v_and_b32 v0, 63, v[vgprSerial] //
+v_and_b32 v1, 15, v0 //
+v_lshrrev_b32 v2, 2, v1 //
+v_lshlrev_b32 v3, 10, v2 //
+v_add_u32 v3, LDS_B_OFFSET, v3 //
+v_add_u32 v4, 1024, v3 //
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4
+v_mov_b32 v5, 1024
+v_cndmask_b32 v5, 0, v5, s[80:81]
+v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 //
+
+
+// Add the per-wave-group offset (WaveID >> 2) * 0x8000 to all three B read addresses.
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0x8000
+v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp0]
+v_add_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], s[sgprTemp0]
+v_add_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], s[sgprTemp0]
+
+
+/******************************************/
+/* Generate Scale Zeros */
+/******************************************/
+// GlobalReadOffsetScale = (Serial & 15) * 4 + (WaveID & 3) * 64
+//                       + (WaveID >> 2) * (SizesSum >> 6) * (StridesA * 4)
+v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15
+v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale]
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64
+v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale]
+
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 //
+s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1]
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale]
+
+
+// GlobalReadOffsetZero = (Serial & 15) + (WaveID & 3) * 16
+//                      + (WaveID >> 2) * (SizesSum >> 6) * StridesA
+v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero]
+
+
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 //
+//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA]
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero]
+
+
+// Waves with WaveID < 8 enlarge ShadowLimitA by WorkGroup0 * MT0 * 2.
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2
+s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
+s_cmp_lt_i32 s[sgprWaveID], 8
+s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2]
+s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3]
+
+// 64-bit tile offset Temp1:Temp0 = WorkGroup0 * MT0 * 8.
+s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+// FIX: the high half must come from the LOW word of the first product (the bits carried out of
+// Temp0 * 8). The original multiplied Temp1 (the high word) instead, which dropped that carry.
+// As before, this assumes WorkGroup0 * MT0 itself fits in 32 bits.
+s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride
+
+// Scale SRD: num_records = (ShadowLimitA >> 4) - tile offset, or BufferLimit if that needs > 32 bits.
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4
+
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0
+
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] //
+
+s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD
+// Zero SRD: same scheme with the limit shifted by 6 and the tile offset divided by 4.
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6
+
+s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0
+
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1
+s_mov_b32 s[sgprZero+3], Srd127_96
+
+/******************************************/
+/* Define Global Load... */
+/******************************************/
+
+// GLOBAL_LOADAB offset: issue six buffer loads with the `lds` modifier (2 for A, 4 for B).
+// m0 is set to LocalWriteAddr{A,A1,B,B1} + \offset before each group; the second load of
+// each B pair adds WAVE_LDS_OFFSET_B*4 to m0.
+.macro GLOBAL_LOADAB offset:req
+
+s_add_u32 m0, s[sgprLocalWriteAddrA], \offset
+buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset
+buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrB], \offset
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+s_add_u32 m0, m0, WAVE_LDS_OFFSET_B*4
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2:vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrB1], \offset
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB1:vgprGlobalReadOffsetB1+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+s_add_u32 m0, m0, WAVE_LDS_OFFSET_B*4
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB1+2:vgprGlobalReadOffsetB1+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+
+.endm
+
+// Load one zero-point byte and one scale dword per lane into ValuZeros / ValuScales.
+.macro GLOBAL_LOAD_Scale_Zero
+buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0
+buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0
+.endm
+
+// Advance the Scale SRD base by GlobalReadIncsA >> 3 and the Zero SRD base by GlobalReadIncsA >> 5,
+// shrinking each SRD's num_records by the same amount (clamped at 0).
+// Clobbers s[sgprTemp0], s[sgprTemp1], s[sgprTemp2] and SCC.
+.macro GLOBAL_INC_Scale_Zero
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0]
+s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1]
+s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0]
+s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // fewer records left than one increment?
+s_cmov_b32 s[sgprScale+2], 0 // yes: clamp the limit to 0
+s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // at least one increment left?
+s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // yes: limit -= increment
+
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0]
+s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1]
+s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0]
+s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // fewer records left than one increment?
+s_cmov_b32 s[sgprZero+2], 0 // yes: clamp the limit to 0
+s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // at least one increment left?
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] // yes: limit -= increment
+
+.endm
+
+
+// Split one dword holding two 16-bit values into two dwords with each value in the upper
+// 16 bits: Out+0 = Scale << 16, Out+1 = Scale & 0xffff0000.
+.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req
+v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale]
+v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0]
+v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1]
+.endm
+
+// Split the two 4-bit fields of a byte (low nibble -> Out+0, high nibble -> Out+1) and convert both to f32.
+.macro UnPackB8To2F32 vgprZero:req vgprOut:req
+v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero]
+v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf
+v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf
+
+v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// Helper for I4ToFp16: convert the f32 in v[\vgpr] to a 16-bit pattern in its low half by adding
+// the bf16 lsb plus the rounding increment in v[vgprMask2], then shifting right by 16.
+// NaN inputs are replaced by v[vgprMask1] before the shift.
+// Clobbers s[sgprTemp0:sgprTemp1] and v[vgprBFtemp]. v[vgprMask1]/v[vgprMask2] must be set by the caller.
+.macro CvtF32ToBF16Bits vgpr:req
+v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[\vgpr], v[\vgpr] // check Nan
+v_bfe_u32 v[vgprBFtemp], v[\vgpr], 16, 1 // Non-Nan case: store lsb of bf16
+v_add3_u32 v[vgprBFtemp], v[\vgpr], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding
+v_cndmask_b32 v[\vgpr], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] //
+v_lshrrev_b32 v[\vgpr], 16, v[\vgpr] // convert C to bf16
+.endm
+
+// I4ToFp16: dequantize four packed bytes (eight 4-bit values) from v[\vgprIn .. \vgprIn+3].
+//   1. split each byte into low/high nibble and convert to f32 (scratch v[vgprValuA_X0_H0+0..7]);
+//   2. value = nibble * v[\vgprScale..+1] + v[\vgprZero..+1] (packed FMA; callers pass -zero*scale);
+//   3. round every value to 16 bits with CvtF32ToBF16Bits;
+//   4. pack pairs into v[\vgprPack+0..3].
+// NOTE(review): despite the name, step 3 produces bf16-style bit patterns (see the inline comments).
+// Always uses v[vgprValuA_X0_H0+0..7] as scratch, independent of the arguments.
+.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn]
+v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1]
+v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2]
+v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3]
+v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf
+
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+
+// Same five-instruction rounding sequence as before, now expanded from one helper per register.
+CvtF32ToBF16Bits vgprValuA_X0_H0+0
+CvtF32ToBF16Bits vgprValuA_X0_H0+1
+CvtF32ToBF16Bits vgprValuA_X0_H0+2
+CvtF32ToBF16Bits vgprValuA_X0_H0+3
+CvtF32ToBF16Bits vgprValuA_X0_H0+4
+CvtF32ToBF16Bits vgprValuA_X0_H0+5
+CvtF32ToBF16Bits vgprValuA_X0_H0+6
+CvtF32ToBF16Bits vgprValuA_X0_H0+7
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2]
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7]
+.endm
+
+/******************************************/
+/* Define Global Load address Increase... */
+/******************************************/
+
+// Advance SrdA by GlobalReadIncsA (and shrink ShadowLimitA / SrdA+2 accordingly) and SrdB by
+// GlobalReadIncsB. SrdB+2 is left untouched. Clobbers s[sgprTemp0], s[sgprTemp1] and SCC.
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1]
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1]
+.endm
+
+/******************************************/
+/* Define LDS Load... */
+/******************************************/
+
+// LDS_LOADAB off: read 4 A bytes (read addresses A+0..A+3) into ValuA_X0_I0 and 8 x 64-bit B
+// values (read addresses B and B+1) into ValuB_X0_I0, all at LDS offset \off.
+.macro LDS_LOADAB off:req
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256
+ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512
+ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0
+
+ds_read_b64 v[vgprValuB_X0_I0+ 8:vgprValuB_X0_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4096
+ds_read_b64 v[vgprValuB_X0_I0+ 10:vgprValuB_X0_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4352
+ds_read_b64 v[vgprValuB_X0_I0+ 12:vgprValuB_X0_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4608
+ds_read_b64 v[vgprValuB_X0_I0+ 14:vgprValuB_X0_I0+ 15], v[vgprLocalReadAddrB+1] offset:\off+4096
+.endm
+
+// LDS_LOADAB1 off: counterpart of LDS_LOADAB for the second half: A read addresses A+4..A+7 into
+// ValuA_X1_I0, B read addresses B (+32 byte offsets) and B+2 into ValuB_X1_I0.
+.macro LDS_LOADAB1 off:req
+
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288
+ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544
+ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0
+
+ds_read_b64 v[vgprValuB_X1_I0+ 8:vgprValuB_X1_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4128
+ds_read_b64 v[vgprValuB_X1_I0+ 10:vgprValuB_X1_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4384
+ds_read_b64 v[vgprValuB_X1_I0+ 12:vgprValuB_X1_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4640
+ds_read_b64 v[vgprValuB_X1_I0+ 14:vgprValuB_X1_I0+ 15], v[vgprLocalReadAddrB+2] offset:\off+4096
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ... */
+/******************************************/
+
+// LoopCntCommon = SizesSum / 64, plus one when SizesSum is not a multiple of 64;
+// LoopCounterL  = LoopCntCommon - 1 (the last iteration is handled by the tail).
+//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6
+
+
+// NOTE(review): the s_min_u32 result (and its SCC) is overwritten by the next two instructions.
+s_min_u32 s[sgprLoopCntCommon], 3, s[sgprLoopCounterL]
+// FIX: the remainder mask was 31, left over from the ">> 5" version above. With 64 elements per
+// iteration it must be 63, otherwise SizesSum % 64 == 32 gets no extra iteration.
+// NOTE(review): confirm the tail path supports a partial 64-element block.
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 63
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+// Waves with WaveID < 8 skip the loader code below; the remaining waves only issue
+// global->LDS loads and terminate at s_endpgm.
+s_cmp_lt_i32 s[sgprWaveID], 8
+s_cbranch_scc1 SkipGL
+
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB1+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+2]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+3]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+
+
+.endif
+
+
+
+
+
+// Loader loop: issue LoopCntCommon-1 block loads here, then one final load at SkipToLastLoad.
+// s[sgprTemp3] counts completed iterations; the wait + barrier is skipped on the first one.
+s_cmp_eq_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 SkipToLastLoad
+s_mov_b32 s[sgprTemp3], 0
+PreFetchBegin:
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+GLOBAL_INC
+
+// Advance the LDS write pointers by one block and wrap back to the original address once
+// they reach ori + LDSMask.
+s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+
+s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+
+
+s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA]
+s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB]
+
+
+
+
+s_cmp_lt_i32 s[sgprTemp3], 1
+s_cbranch_scc1 SkipWait
+s_waitcnt vmcnt(6)
+s_barrier
+SkipWait:
+s_add_u32 s[sgprTemp3], s[sgprTemp3], 1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 PreFetchBegin
+s_waitcnt vmcnt(0)
+s_barrier
+s_barrier
+
+// Final load: the write pointers are reset to their original addresses first.
+SkipToLastLoad:
+s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA]
+s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB]
+
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt vmcnt(0)
+s_barrier
+s_endpgm
+SkipGL:
+
+/******************************************/
+/* Generate SrcD ... */
+/******************************************/
+
+// SrdD base = AddressD + 2 * (WorkGroup0*128 + WorkGroup1*MT1*StridesD + WorkGroup2*StridesC+1)
+// SrdD+2    = 2 * (StridesD*SizesFree+1 - WorkGroup1*MT1*StridesD)
+// NOTE(review): the WorkGroup0 tile width is the literal 128 here rather than a symbol.
+s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower)
+s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128
+s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD]
+s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2]
+s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3]
+s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1]
+s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1]
+s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1
+s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD
+
+// lane = Serial & 63
+// GlobalWriteOffsetD = 2 * StridesD * ((WaveID >> 2) * 64 + (lane >> 4)) + (lane & 15) * 4
+//                    + (WaveID & 3) * 64
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0]
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x6
+v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1]
+v_and_b32 v[vgprTemp2], 15, v[vgprTemp0]
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2]
+v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD]
+v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1]
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2]
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0]
+
+/******************************************/
+/* Init ValueC ... */
+/******************************************/
+// Start the first scale/zero load, then clear the accumulators while it is in flight.
+GLOBAL_LOAD_Scale_Zero
+v_mov_b32 v[vgprValuC+0], 0x0
+v_mov_b32 v[vgprValuC+1], 0x0
+v_mov_b32 v[vgprValuC+2], 0x0
+v_mov_b32 v[vgprValuC+3], 0x0
+v_mov_b32 v[vgprValuC+4], 0x0
+v_mov_b32 v[vgprValuC+5], 0x0
+v_mov_b32 v[vgprValuC+6], 0x0
+v_mov_b32 v[vgprValuC+7], 0x0
+v_mov_b32 v[vgprValuC+8], 0x0
+v_mov_b32 v[vgprValuC+9], 0x0
+v_mov_b32 v[vgprValuC+10], 0x0
+v_mov_b32 v[vgprValuC+11], 0x0
+v_mov_b32 v[vgprValuC+12], 0x0
+v_mov_b32 v[vgprValuC+13], 0x0
+v_mov_b32 v[vgprValuC+14], 0x0
+v_mov_b32 v[vgprValuC+15], 0x0
+v_mov_b32 v[vgprValuC+16], 0x0
+v_mov_b32 v[vgprValuC+17], 0x0
+v_mov_b32 v[vgprValuC+18], 0x0
+v_mov_b32 v[vgprValuC+19], 0x0
+v_mov_b32 v[vgprValuC+20], 0x0
+v_mov_b32 v[vgprValuC+21], 0x0
+v_mov_b32 v[vgprValuC+22], 0x0
+v_mov_b32 v[vgprValuC+23], 0x0
+v_mov_b32 v[vgprValuC+24], 0x0
+v_mov_b32 v[vgprValuC+25], 0x0
+v_mov_b32 v[vgprValuC+26], 0x0
+v_mov_b32 v[vgprValuC+27], 0x0
+v_mov_b32 v[vgprValuC+28], 0x0
+v_mov_b32 v[vgprValuC+29], 0x0
+v_mov_b32 v[vgprValuC+30], 0x0
+v_mov_b32 v[vgprValuC+31], 0x0
+v_mov_b32 v[vgprValuC+32], 0x0
+v_mov_b32 v[vgprValuC+33], 0x0
+v_mov_b32 v[vgprValuC+34], 0x0
+v_mov_b32 v[vgprValuC+35], 0x0
+v_mov_b32 v[vgprValuC+36], 0x0
+// (continues the zero-initialisation of the ValuC accumulators: entries +37 .. +63)
+v_mov_b32 v[vgprValuC+37], 0x0
+v_mov_b32 v[vgprValuC+38], 0x0
+v_mov_b32 v[vgprValuC+39], 0x0
+v_mov_b32 v[vgprValuC+40], 0x0
+v_mov_b32 v[vgprValuC+41], 0x0
+v_mov_b32 v[vgprValuC+42], 0x0
+v_mov_b32 v[vgprValuC+43], 0x0
+v_mov_b32 v[vgprValuC+44], 0x0
+v_mov_b32 v[vgprValuC+45], 0x0
+v_mov_b32 v[vgprValuC+46], 0x0
+v_mov_b32 v[vgprValuC+47], 0x0
+v_mov_b32 v[vgprValuC+48], 0x0
+v_mov_b32 v[vgprValuC+49], 0x0
+v_mov_b32 v[vgprValuC+50], 0x0
+v_mov_b32 v[vgprValuC+51], 0x0
+v_mov_b32 v[vgprValuC+52], 0x0
+v_mov_b32 v[vgprValuC+53], 0x0
+v_mov_b32 v[vgprValuC+54], 0x0
+v_mov_b32 v[vgprValuC+55], 0x0
+v_mov_b32 v[vgprValuC+56], 0x0
+v_mov_b32 v[vgprValuC+57], 0x0
+v_mov_b32 v[vgprValuC+58], 0x0
+v_mov_b32 v[vgprValuC+59], 0x0
+v_mov_b32 v[vgprValuC+60], 0x0
+v_mov_b32 v[vgprValuC+61], 0x0
+v_mov_b32 v[vgprValuC+62], 0x0
+v_mov_b32 v[vgprValuC+63], 0x0
+
+// Advance the Scale/Zero descriptors for the next group.
+GLOBAL_INC_Scale_Zero
+
+// Constants read by I4ToFp16: vgprMask1 replaces NaN inputs, vgprMask2 is the rounding increment.
+v_mov_b32 v[vgprMask1], 0x7fff0000
+v_mov_b32 v[vgprMask2], 0x7fff
+/******************************************/
+/* LoopCounter == 0, Skip to tail/last loop ... */
+/******************************************/
+s_cmp_le_i32 s[sgprLoopCounterL], 0
+s_cbranch_scc1 TAIL_LOOP
+s_waitcnt vmcnt(0)
+s_barrier
+
+// Prologue: read the first half-step from LDS block 0 and prepare scale / (-zero * scale),
+// so that I4ToFp16 can compute nibble * scale + (-zero * scale) with one FMA.
+LDS_LOADAB LDS_BLK_OFFSET*0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+
+/******************************************/
+/* Main Loop Process ... */
+/******************************************/
+
+// Two copies of the main loop follow: waves with WaveID < 4 run MainLoopBeginW0_3, the others
+// run MainLoopBeginW4_7. Both are unrolled six times, cycling through LDS blocks 0,1,2,0,1,2.
+// Every step decrements LoopCntCommon and leaves for TAIL_LOOP when it reaches 0. Even steps
+// (2, 4, 6) also fetch the next scale/zero pair and refresh the dequantisation constants.
+// The two copies differ only in where s_barrier is placed (one barrier per step in both)
+// and in the debug dumps, which exist only in the W4_7 copy.
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc1 WaveID_gecase
+
+s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL]
+MainLoopBeginW0_3:
+
+// step 1: second half of block 0, then first half of block 1
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// step 2: second half of block 1, then first half of block 2; new scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// step 3: second half of block 2, then first half of block 0
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// step 4: second half of block 0, then first half of block 1; new scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// step 5: second half of block 1, then first half of block 2
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+
+// step 6: second half of block 2, then first half of block 0; new scale/zero; loop back
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 MainLoopBeginW0_3
+
+// Always taken when the loop above falls through, so waves 0-3 never enter WaveID_gecase.
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+WaveID_gecase:
+
+
+s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL]
+MainLoopBeginW4_7:
+
+// step 1 (here s_barrier comes right after the first I4ToFp16, before the first MMAC)
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// step 2: new scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+// Debug only: dump the hard-coded register v[32] to the debug buffer.
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[32]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// step 3
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[32]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// step 4: new scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[32]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+
+
+// step 5
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[32]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+
+// step 6: new scale/zero; loop back
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(12)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[32]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 MainLoopBeginW4_7
+
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+/******************************************/
+/* Tail Loop Process ... 
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[32] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1] + +s_mul_i32 s[sgprTemp2], 8192, s[sgprWaveID] +v_add_u32 v[vgprLocalWriteC], s[sgprTemp2], v[vgprLocalWriteC] + +s_barrier + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3 +v_add_u32 v[vgprLocalReadC], 0x8000, v[vgprLocalWriteC] +Skip_Wave0_3: + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7 +s_mov_b32 s[sgprTemp2], 0x8000 +v_sub_u32 v[vgprLocalReadC], v[vgprLocalWriteC], s[sgprTemp2] +Skip_Wave4_7: + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3_W +ds_write_b32 v[vgprLocalWriteC], v[32], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[36], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[33], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[37], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[34], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[38], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[35], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[39], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[40], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[44], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[41], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[45], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[42], offset:3072 +ds_write_b32 v[vgprLocalWriteC], 
v[46], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[43], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[47], offset:3840 + +ds_write_b32 v[vgprLocalWriteC], v[48], offset:4096 +ds_write_b32 v[vgprLocalWriteC], v[52], offset:4352 +ds_write_b32 v[vgprLocalWriteC], v[49], offset:4608 +ds_write_b32 v[vgprLocalWriteC], v[53], offset:4864 +ds_write_b32 v[vgprLocalWriteC], v[50], offset:5120 +ds_write_b32 v[vgprLocalWriteC], v[54], offset:5376 +ds_write_b32 v[vgprLocalWriteC], v[51], offset:5632 +ds_write_b32 v[vgprLocalWriteC], v[55], offset:5888 + +ds_write_b32 v[vgprLocalWriteC], v[56], offset:6144 +ds_write_b32 v[vgprLocalWriteC], v[60], offset:6400 +ds_write_b32 v[vgprLocalWriteC], v[57], offset:6656 +ds_write_b32 v[vgprLocalWriteC], v[61], offset:6912 +ds_write_b32 v[vgprLocalWriteC], v[58], offset:7168 +ds_write_b32 v[vgprLocalWriteC], v[62], offset:7424 +ds_write_b32 v[vgprLocalWriteC], v[59], offset:7680 +ds_write_b32 v[vgprLocalWriteC], v[63], offset:7936 + +s_waitcnt lgkmcnt(0) +s_barrier +ds_read_b32 v[vgprTmpValC+32], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+36], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+33], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+37], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+34], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+38], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+35], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+39], v[vgprLocalReadC+0] offset:1792 +ds_read_b32 v[vgprTmpValC+40], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+44], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+41], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+45], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 v[vgprTmpValC+42], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+46], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+43], v[vgprLocalReadC+0] offset:3584 +ds_read_b32 v[vgprTmpValC+47], 
v[vgprLocalReadC+0] offset:3840 + + +ds_read_b32 v[vgprTmpValC+48], v[vgprLocalReadC+0] offset:4096 +ds_read_b32 v[vgprTmpValC+52], v[vgprLocalReadC+0] offset:4352 +ds_read_b32 v[vgprTmpValC+49], v[vgprLocalReadC+0] offset:4608 +ds_read_b32 v[vgprTmpValC+53], v[vgprLocalReadC+0] offset:4864 +ds_read_b32 v[vgprTmpValC+50], v[vgprLocalReadC+0] offset:5120 +ds_read_b32 v[vgprTmpValC+54], v[vgprLocalReadC+0] offset:5376 +ds_read_b32 v[vgprTmpValC+51], v[vgprLocalReadC+0] offset:5632 +ds_read_b32 v[vgprTmpValC+55], v[vgprLocalReadC+0] offset:5888 +ds_read_b32 v[vgprTmpValC+56], v[vgprLocalReadC+0] offset:6144 +ds_read_b32 v[vgprTmpValC+60], v[vgprLocalReadC+0] offset:6400 +ds_read_b32 v[vgprTmpValC+57], v[vgprLocalReadC+0] offset:6656 +ds_read_b32 v[vgprTmpValC+61], v[vgprLocalReadC+0] offset:6912 +ds_read_b32 v[vgprTmpValC+58], v[vgprLocalReadC+0] offset:7168 +ds_read_b32 v[vgprTmpValC+62], v[vgprLocalReadC+0] offset:7424 +ds_read_b32 v[vgprTmpValC+59], v[vgprLocalReadC+0] offset:7680 +ds_read_b32 v[vgprTmpValC+63], v[vgprLocalReadC+0] offset:7936 + +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+32] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+36] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+33] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+37] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+34] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+38] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+35] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+39] + +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+40] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+44] +v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+41] +v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+45] +v_add_f32 v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+42] +v_add_f32 v[vgprTmpValC+14], v[vgprTmpValC+14], 
v[vgprTmpValC+46] +v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+43] +v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+47] + +v_add_f32 v[vgprTmpValC+16], v[vgprTmpValC+16], v[vgprTmpValC+48] +v_add_f32 v[vgprTmpValC+20], v[vgprTmpValC+20], v[vgprTmpValC+52] +v_add_f32 v[vgprTmpValC+17], v[vgprTmpValC+17], v[vgprTmpValC+49] +v_add_f32 v[vgprTmpValC+21], v[vgprTmpValC+21], v[vgprTmpValC+53] +v_add_f32 v[vgprTmpValC+18], v[vgprTmpValC+18], v[vgprTmpValC+50] +v_add_f32 v[vgprTmpValC+22], v[vgprTmpValC+22], v[vgprTmpValC+54] +v_add_f32 v[vgprTmpValC+19], v[vgprTmpValC+19], v[vgprTmpValC+51] +v_add_f32 v[vgprTmpValC+23], v[vgprTmpValC+23], v[vgprTmpValC+55] + +v_add_f32 v[vgprTmpValC+24], v[vgprTmpValC+24], v[vgprTmpValC+56] +v_add_f32 v[vgprTmpValC+28], v[vgprTmpValC+28], v[vgprTmpValC+60] +v_add_f32 v[vgprTmpValC+25], v[vgprTmpValC+25], v[vgprTmpValC+57] +v_add_f32 v[vgprTmpValC+29], v[vgprTmpValC+29], v[vgprTmpValC+61] +v_add_f32 v[vgprTmpValC+26], v[vgprTmpValC+26], v[vgprTmpValC+58] +v_add_f32 v[vgprTmpValC+30], v[vgprTmpValC+30], v[vgprTmpValC+62] +v_add_f32 v[vgprTmpValC+27], v[vgprTmpValC+27], v[vgprTmpValC+59] +v_add_f32 v[vgprTmpValC+31], v[vgprTmpValC+31], v[vgprTmpValC+63] +Skip_Wave0_3_W: + + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7_W + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 
+ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +ds_write_b32 v[vgprLocalWriteC], v[16], offset:4096 +ds_write_b32 v[vgprLocalWriteC], v[20], offset:4352 +ds_write_b32 v[vgprLocalWriteC], v[17], offset:4608 +ds_write_b32 v[vgprLocalWriteC], v[21], offset:4864 +ds_write_b32 v[vgprLocalWriteC], v[18], offset:5120 +ds_write_b32 v[vgprLocalWriteC], v[22], offset:5376 +ds_write_b32 v[vgprLocalWriteC], v[19], offset:5632 +ds_write_b32 v[vgprLocalWriteC], v[23], offset:5888 + +ds_write_b32 v[vgprLocalWriteC], v[24], offset:6144 +ds_write_b32 v[vgprLocalWriteC], v[28], offset:6400 +ds_write_b32 v[vgprLocalWriteC], v[25], offset:6656 +ds_write_b32 v[vgprLocalWriteC], v[29], offset:6912 +ds_write_b32 v[vgprLocalWriteC], v[26], offset:7168 +ds_write_b32 v[vgprLocalWriteC], v[30], offset:7424 +ds_write_b32 v[vgprLocalWriteC], v[27], offset:7680 +ds_write_b32 v[vgprLocalWriteC], v[31], offset:7936 + +s_waitcnt lgkmcnt(0) +s_barrier + + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+0] offset:1792 +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+0] offset:3584 
+ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+0] offset:3840 + +ds_read_b32 v[vgprTmpValC+16], v[vgprLocalReadC+0] offset:4096 +ds_read_b32 v[vgprTmpValC+20], v[vgprLocalReadC+0] offset:4352 +ds_read_b32 v[vgprTmpValC+17], v[vgprLocalReadC+0] offset:4608 +ds_read_b32 v[vgprTmpValC+21], v[vgprLocalReadC+0] offset:4864 +ds_read_b32 v[vgprTmpValC+18], v[vgprLocalReadC+0] offset:5120 +ds_read_b32 v[vgprTmpValC+22], v[vgprLocalReadC+0] offset:5376 +ds_read_b32 v[vgprTmpValC+19], v[vgprLocalReadC+0] offset:5632 +ds_read_b32 v[vgprTmpValC+23], v[vgprLocalReadC+0] offset:5888 +ds_read_b32 v[vgprTmpValC+24], v[vgprLocalReadC+0] offset:6144 +ds_read_b32 v[vgprTmpValC+28], v[vgprLocalReadC+0] offset:6400 +ds_read_b32 v[vgprTmpValC+25], v[vgprLocalReadC+0] offset:6656 +ds_read_b32 v[vgprTmpValC+29], v[vgprLocalReadC+0] offset:6912 +ds_read_b32 v[vgprTmpValC+26], v[vgprLocalReadC+0] offset:7168 +ds_read_b32 v[vgprTmpValC+30], v[vgprLocalReadC+0] offset:7424 +ds_read_b32 v[vgprTmpValC+27], v[vgprLocalReadC+0] offset:7680 +ds_read_b32 v[vgprTmpValC+31], v[vgprLocalReadC+0] offset:7936 + +s_waitcnt lgkmcnt(0) + +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+32] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+36] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+33] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+37] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+34] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+38] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+35] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+39] + +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+40] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+44] +v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+41] +v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+45] +v_add_f32 v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+42] +v_add_f32 v[vgprTmpValC+14], 
v[vgprTmpValC+14], v[vgprTmpValC+46] +v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+43] +v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+47] + +v_add_f32 v[vgprTmpValC+16], v[vgprTmpValC+16], v[vgprTmpValC+48] +v_add_f32 v[vgprTmpValC+20], v[vgprTmpValC+20], v[vgprTmpValC+52] +v_add_f32 v[vgprTmpValC+17], v[vgprTmpValC+17], v[vgprTmpValC+49] +v_add_f32 v[vgprTmpValC+21], v[vgprTmpValC+21], v[vgprTmpValC+53] +v_add_f32 v[vgprTmpValC+18], v[vgprTmpValC+18], v[vgprTmpValC+50] +v_add_f32 v[vgprTmpValC+22], v[vgprTmpValC+22], v[vgprTmpValC+54] +v_add_f32 v[vgprTmpValC+19], v[vgprTmpValC+19], v[vgprTmpValC+51] +v_add_f32 v[vgprTmpValC+23], v[vgprTmpValC+23], v[vgprTmpValC+55] + +v_add_f32 v[vgprTmpValC+24], v[vgprTmpValC+24], v[vgprTmpValC+56] +v_add_f32 v[vgprTmpValC+28], v[vgprTmpValC+28], v[vgprTmpValC+60] +v_add_f32 v[vgprTmpValC+25], v[vgprTmpValC+25], v[vgprTmpValC+57] +v_add_f32 v[vgprTmpValC+29], v[vgprTmpValC+29], v[vgprTmpValC+61] +v_add_f32 v[vgprTmpValC+26], v[vgprTmpValC+26], v[vgprTmpValC+58] +v_add_f32 v[vgprTmpValC+30], v[vgprTmpValC+30], v[vgprTmpValC+62] +v_add_f32 v[vgprTmpValC+27], v[vgprTmpValC+27], v[vgprTmpValC+59] +v_add_f32 v[vgprTmpValC+31], v[vgprTmpValC+31], v[vgprTmpValC+63] + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprLocalWriteC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + +Skip_Wave4_7_W: + + + + + +/******************************************/ +/* Global Write 
Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase // NOTE(review): the target label is the next line, so taken and not-taken paths run the same code +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store initial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store initial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // Nan lanes take v[vgprMask1]; other lanes take v[vgprBFtemp] +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword
v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge 
+v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] 
+s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], 
v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 
v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 
// Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 
v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 
s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], 
v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], 
v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 
17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], 
v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] 
+v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, 
v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], 
v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 
s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // 
+v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 
v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_dequant.s b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_dequant.s new file mode 100644 index 0000000000000000000000000000000000000000..3bb565f9c5d44501d81a8c38b07978a5da03ae7a --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_dequant.s @@ -0,0 +1,1382 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32_dq +.globl Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32_dq +.p2align 8 +.type 
Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32_dq,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32_dq + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32_dq + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32_dq.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + 
.value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + 
.sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32_dq: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprGlobalWriteD_Edge, 230 + +.set vgprMask1, 252 +.set vgprMask2, 253 +.set vgprBFtemp, 254 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 212 +.set vgprGlobalReadOffsetB, 214 +.set vgprGlobalReadOffsetB1, 218 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 
11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 //define the debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprLDSMask, 75 // NOTE(review): reassigned to 89 further down; 75 is also sgprLocalWriteAddrZero +//.set sgprLoopforPfIter, 76 +//.set sgprLDSWriteIter, 78 + +.set sgprLocalWriteAddrA1, 76 +.set sgprLocalWriteAddrB1, 77 +.set sgprLocalWriteAddrA1ori, 78 +.set sgprLocalWriteAddrB1ori, 79 + + + +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // NOTE(review): same index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // NOTE(review): 96..98 are also named sgprTemp4..sgprTemp6 +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // NOTE(review): same index as sgprSrdC (20) + +.set MT0, 32 +.set MT1, 32 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 4096 +.set LDS_BLK_OFFSET_64Kmasked, 4096 +.set LOG2BPE, 1 +//.set BPE, 2 +.set BPE, 1 // NOTE(review): LOG2BPE above is still 1 while BPE is now 1; confirm which is intended +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + 
+/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +//s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x4000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, 
s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 
= s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, 
s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer //compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v1=wg1*nwg0+wg0 +v_lshlrev_b32 v5, 0x8, v4 // v1 = v1 * 256 //this is the total thread count; adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 
v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWaveID] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprStridesA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + +/******************************************/ +/* Generate Global A parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +//s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +//s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mov_b32 s[sgprTemp1], s[sgprStridesA] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + +//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice +s_lshr_b32 s[sgprTemp1], s[sgprTemp1], 1 +v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + + + + + + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 2*MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +//s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG + +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 + +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_and_b32 s[sgprTemp0], s[sgprStridesA], 15 +s_sub_u32 s[sgprTemp1], 16, s[sgprTemp0] +s_add_u32 s[sgprTemp1], s[sgprShadowLimitA+0], s[sgprTemp1] +s_cmp_gt_u32 s[sgprTemp0], 0 +s_cmov_b32 s[sgprShadowLimitA+0], s[sgprTemp1] + +//s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +//s_cbranch_scc1 label_SkipMmac + +//s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate LDS A parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//fixed input: v[vgprTemp0]->added address; +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] +v_sub_u32 
\vaddr, \vaddr, v[vgprTemp3] +.endm +v_mov_b32 v[vgprTemp2], 0x200 +v_mov_b32 v[vgprTemp1], 0x400 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 1 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 // +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cmov_b32 
s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], 8*MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8*MT0 // WorkGroup[01] * MT + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 // scale step = A read increment / 8 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // scale SRD dword 2 < step ? +s_cmov_b32 s[sgprScale+2], 0 // yes: clamp to 0 instead of wrapping +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // scale SRD dword 2 >= step ? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // yes: take the decremented value + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 // zero-point step = A read increment / 32 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // zero SRD dword 2 < step ? +s_cmov_b32 s[sgprZero+2], 0 // yes: clamp to 0 instead of wrapping +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // zero SRD dword 2 >= step ? 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1] +.endm + +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 
v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprPack+0], 16, v[vgprValuA_X0_H0+0] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprPack+1], 16, v[vgprValuA_X0_H0+1] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprPack+2], 16, v[vgprValuA_X0_H0+2] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // 
+v_lshrrev_b32 v[\vgprPack+3], 16, v[vgprValuA_X0_H0+3] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprPack+4], 16, v[vgprValuA_X0_H0+4] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprPack+5], 16, v[vgprValuA_X0_H0+5] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprPack+6], 16, v[vgprValuA_X0_H0+6] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[\vgprPack+7], 16, v[vgprValuA_X0_H0+7] 
// convert C to bf16 + +.endm + + +/* WriteFp16ToGlobal: issues eight buffer_store_short from v[vgprOut+0..7] through the descriptor s[sgprSrdD:sgprSrdD+3], two per step, using offsets v[vgprGlobalWriteOffsetD+0] and v[vgprGlobalWriteOffsetD+1]. After every pair the descriptor is stepped by sgprStridesD*2*k with k = 1, 1, 1, 13. Clobbers s[sgprTemp0] and SCC; SrdD is left modified for the next call. */ +.macro WriteFp16ToGlobal vgprOut:req +buffer_store_short v[\vgprOut+0], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+1], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +/* Step the descriptor: base words SrdD+0/SrdD+1 grow by StridesD*2 and word SrdD+2 shrinks by the same amount; s_cmov_b32 then writes 0 to SrdD+2 if SCC is set. NOTE(review): s_subb_u32 also consumes the SCC left by s_addc_u32 as borrow-in - confirm that is intended. */ +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +buffer_store_short v[\vgprOut+2], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+3], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 + +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +buffer_store_short v[\vgprOut+4], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+5], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 + +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +buffer_store_short v[\vgprOut+6], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+7], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +/* NOTE(review): the final step uses 2*13 instead of 2*1 - presumably so one call advances 1+1+1+13 = 16 rows in total; confirm against the tile layout. */ +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*13 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +.endm + + + + +/******************************************/ +/* Define Global Load address Increase... 
*/ +/******************************************/ +/* GLOBAL_INC: adds s[sgprGlobalReadIncsA+0] to the 64-bit base held in s[sgprSrdA+0..1] and subtracts it from the 64-bit s[sgprShadowLimitA+0..1]; SrdA+2 then takes the low limit word when the high limit word is 0, otherwise BufferLimit. Clobbers s[sgprTemp0], s[sgprTemp1] and SCC. */ +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +.endm + +/******************************************/ +/* Define LDS Load... */ +/******************************************/ +/* LDS_LOADAB off: four ds_read_u8 from v[vgprLocalReadAddrA+0..3] at offset off into v[vgprValuA_X0_I0+0..3]. Despite the AB suffix only A registers are loaded. No s_waitcnt is issued here, so the caller has to wait on lgkmcnt before using the results. */ +.macro LDS_LOADAB off:req + +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off +.endm +/* LDS_LOADAB1 off: same pattern for the second register set - reads v[vgprLocalReadAddrA+4..7] at offset off into v[vgprValuA_X1_I0+0..3]; again no wait is issued. */ +.macro LDS_LOADAB1 off:req + +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 + + +s_min_u32 s[sgprLoopCntCommon], 4, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cbranch_scc1 SkipGL + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + +.endif + + + +s_cmp_eq_i32 
s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +s_cmp_lt_i32 s[sgprTemp3], 2 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(4) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_waitcnt vmcnt(2) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +//s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] + +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesSum] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2 +//s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x4 + +s_lshr_b32 s[sgprTemp1], s[sgprSizesSum], 0x1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +v_add_u32 v[vgprGlobalWriteOffsetD+1], 
v[vgprGlobalWriteOffsetD], 2 + +/******************************************/ +/* Init ValueC ... */ +/******************************************/ +GLOBAL_LOAD_Scale_Zero + +GLOBAL_INC_Scale_Zero + + +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 + +v_and_b32 v[vgprGlobalWriteD_Edge+2], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalWriteD_Edge+2], 1, v[vgprGlobalWriteD_Edge+2] +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprGlobalWriteD_Edge+2], v[vgprGlobalWriteD_Edge+2], s[sgprTemp1] +v_lshlrev_b32 v[vgprGlobalWriteD_Edge+2], 1, v[vgprGlobalWriteD_Edge+2] +//v_mov_b32 v[vgprGlobalWriteD_Edge+3], v[vgprGlobalWriteD_Edge+2] //store inittial addr +//v_mov_b32 v[vgprGlobalWriteD_Edge+0], v[vgprGlobalWriteOffsetD] //store inittial addr + +//v_mov_b32 v[vgprGlobalWriteD_Edge+1], v[vgprGlobalWriteD_Edge+0] +//v_mov_b32 v[vgprGlobalWriteD_Edge+2], v[vgprGlobalWriteD_Edge+3] +//v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprGlobalWriteD_Edge+2], s[sgprD_MEdge] +v_cmp_gt_u32 s[sgprTemp2:sgprTemp2+1], v[vgprGlobalWriteD_Edge+2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], -1, s[sgprTemp2:sgprTemp2+1] +v_cndmask_b32 v[vgprGlobalWriteOffsetD+1], v[vgprGlobalWriteOffsetD+1], -1, s[sgprTemp2:sgprTemp2+1] + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... 
*/ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 + +.if debug_buffer + +v_mov_b32 v[vgprDebugTmp], v[vgprValuScales+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuZeros+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + + + +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +v_mov_b32 v[vgprMask1], 0x7fff0000 +v_mov_b32 v[vgprMask2], 0x7fff + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + + + +.if debug_buffer + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + +//v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrA+0] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +//v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrA+1] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+3] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump 
inc + +//v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD+0] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +//v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD+1] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + + + +.endif + + + + + +WriteFp16ToGlobal vgprValuA_X2_I0 + +//s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 + +//s_waitcnt vmcnt(0) +WriteFp16ToGlobal vgprValuA_X3_I0 + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +WriteFp16ToGlobal vgprValuA_X2_I0 +s_waitcnt vmcnt(8) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +WriteFp16ToGlobal vgprValuA_X2_I0 +//s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 
vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +WriteFp16ToGlobal vgprValuA_X2_I0 +s_waitcnt vmcnt(8) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/******************************************/ +/* Tail Loop Process ... 
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+3] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + + + +WriteFp16ToGlobal vgprValuA_X2_I0 +s_waitcnt vmcnt(0) +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 + + +s_endpgm \ No newline at end of file diff --git 
a/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..57a0fd124618b986db520c4beeb3c808a6d503b6 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,1854 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.globl Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 24576 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: 
Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + 
.value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 24576 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 +.set vgprValuB_X0_I0, 244 +.set vgprValuB_X1_I0, 248 +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprMask1, 54 +.set vgprMask2, 55 +.set vgprBFtemp, 56 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set 
vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 /* NOTE(review): A0/A1 are 0/1 and B0..B2 are 32..34, so 2 may have been intended here - confirm */ +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 /* NOTE(review): same index as sgprAddressC (30) - confirm the overlap is intended */ +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 /* redefined to 88 further down */ +.set sgprLDSMask, 75 /* redefined to 89 further down */ +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 /* shares index 84 with sgprTensor2dSizeA */ +.set sgprWaveID, 88 +.set sgprLDSMask, 89 + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 
+.set sgprTemp7, 99 +.set sgprStrideStruct, 96 +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 + + + +.macro MMAC_32x32_0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.macro MMAC_32x32_1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 
v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 32 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 4096 +.set LDS_BLK_OFFSET_64Kmasked, 4096 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 32 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], 
s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x6000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 
+v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, 
v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? 
+s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved in sgprWorkGroup0Ori +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256 // 256 is the total thread count; adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v7 = tid + 2*256*v4 = serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = serial*0x40 (original note: serial*nipt*4) +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = upper dword of AddressDbg +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddressDbg + serial*0x40 (low dword, carry in vcc) +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg+1]=upper dword plus carry +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] 
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 5, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 31, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 7, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 
+s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +/* +v_and_b32 v3, 1, v1 +v_mul_u32_u24 v3, 0x40, v3 +v_mul_u32_u24 v2, WAVE_LDS_OFFSET_A+0, v1 +v_add_u32 v2, v3, v2 +*/ +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//fixed input: v[vgprTemp0]->added address; +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm 
+v_mov_b32 v[vgprTemp2], 0x200 // ADDR_WRAP reducer +v_mov_b32 v[vgprTemp1], 0x400 // ADDR_WRAP max clip +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 // ADDR_WRAP max clip +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*32 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 // (GlWaveID & 3) placed in bits 16 and up +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep the start value for later wrap-around + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // t = serial & 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 // t & 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] // t >> 4 +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] // (t & 3) * 64 +v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] // (t >> 4) * 8; kept in vgprTemp2 (previously written to v[sgprTemp2], an SGPR index used as a VGPR number) +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] // (t & 15) >> 2 +s_mov_b32 s[sgprTemp1], 0x210 // L1477 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] // ((t & 15) >> 2) * 0x210 +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2] // sum of the three partial offsets +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] // add LDS_B_OFFSET +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+1], 0x100, v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+2], 0x120, v[vgprLocalReadAddrB] + +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 9, 
v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp1], 8 // tlu=0, scaled tile-offset by stride +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset // m0 = A write address + \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset // m0 = B write address + \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 // scale step = GlobalReadIncsA / 8 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] // candidate new limit = limit - step +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // limit < step? then clamp the limit to 0 +s_cmov_b32 s[sgprScale+2], 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // limit >= step? then limit -= step +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 // zero-point step = GlobalReadIncsA / 32 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] // candidate new limit = limit - step +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // limit < step? then clamp the limit to 0 +s_cmov_b32 s[sgprZero+2], 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // limit >= step? then limit -= step 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + +/* +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1] +.endm +*/ + +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1] +.endm + +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 
v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + + + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+0], 16, v[vgprValuA_X0_H0+0] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 16, v[vgprValuA_X0_H0+1] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+2], 16, v[vgprValuA_X0_H0+2] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] // check 
Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 16, v[vgprValuA_X0_H0+3] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[vgprValuA_X0_H0+4] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 16, v[vgprValuA_X0_H0+5] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+6], 16, v[vgprValuA_X0_H0+6] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] // check Nan +v_bfe_u32 v[vgprBFtemp], 
v[vgprValuA_X0_H0+7], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 16, v[vgprValuA_X0_H0+7] // convert C to bf16 + +//v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +//v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +//v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +//v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +//v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +//v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +//v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +//v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] +.endm + +/******************************************/ +/* Define Global Load address increment... */ +/******************************************/ + +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] +//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] +//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + + +.endm + +/******************************************/ +/* Define LDS Load... */ +/******************************************/ + +.macro LDS_LOADAB off:req + +//ds_read_m32x16_b16 v[vgprValuA_X0_I0+ 0:vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+0] offset:\off + +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off + +ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0 +ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB+1] offset:\off+0 +.endm + +.macro LDS_LOADAB1 off:req + +//ds_read_m32x16_b16 v[vgprValuA_X1_I0+ 0:vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off + +ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32 +ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB+2] offset:\off+0 +.endm + 
+/******************************************/ +/* Use Global Load Wave process ... */ +/******************************************/ + +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 // SizesSum / 32 +s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL] // NOTE(review): this result is overwritten two instructions below before it is read - confirm whether the clamp to 6 is still intended +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 // scc = 1 when SizesSum is not a multiple of 32 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc // SizesSum / 32, rounded up +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 4 // waves with WaveID < 4 skip the global-load path +s_cbranch_scc1 SkipGL +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 // only one iteration: go straight to the last load +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 // number of prefetch iterations issued so far +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] // past start + LDSMask? wrap back to the start address +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] // past start + LDSMask? wrap back to the start address +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + +s_cmp_lt_i32 s[sgprTemp3], 4 // no wait/barrier for the first four iterations +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(8) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 // pick the drain entry point from the number of iterations issued +s_cbranch_scc1 Last1 +s_cmp_eq_i32 s[sgprTemp3], 2 +s_cbranch_scc1 Last2 +s_cmp_eq_i32 s[sgprTemp3], 3 +s_cbranch_scc1 Last3 +s_waitcnt vmcnt(6) +s_barrier +Last3: +s_waitcnt vmcnt(4) +s_barrier +Last2: +s_waitcnt vmcnt(2) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm // waves on the global-load path end here +SkipGL: + +/******************************************/ +/* Generate SrdD ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 + +GLOBAL_INC_Scale_Zero + +v_mov_b32 v[vgprMask1], 0x7fff0000 +v_mov_b32 v[vgprMask2], 0x7fff + + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X0_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X0_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X0_I0+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X0_I0+3] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + + +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store 
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[8] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 
vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt 
vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP +s_branch WaveID_EndSwitch +WaveID_gecase: + +WaveID_EndSwitch: + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +//v_mov_b32 v[vgprDebugTmp], v[1] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[8] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +//v_mov_b32 v[vgprDebugTmp], v[9] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], 
v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_mov_b32 s[sgprTemp1], s[sgprWaveID] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] + +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], 
v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], 
v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], 
v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 
+buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 
v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 
s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, 
v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], 
v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], 
v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s new file mode 100644 index 0000000000000000000000000000000000000000..423ba0c045d435b813392ec27c461d1a47ecc783 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s @@ -0,0 +1,2098 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.globl 
Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.p2align 8 +.type Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: 
SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + 
.value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprMask1, 54 +.set vgprMask2, 55 +.set vgprBFtemp, 56 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 212 +.set vgprGlobalReadOffsetB, 214 +.set vgprGlobalReadOffsetB1, 218 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set 
sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) - confirm the overlap is intended +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprLDSMask, 75 // superseded: sgprLDSMask is set again to 89 below +//.set sgprLoopforPfIter, 76 +//.set sgprLDSWriteIter, 78 + +.set sgprLocalWriteAddrA1, 76 +.set sgprLocalWriteAddrB1, 77 +.set sgprLocalWriteAddrA1ori, 78 +.set sgprLocalWriteAddrB1ori, 79 + + + +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 // final value of sgprLDSMask (replaces 75 above) + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same index as sgprTemp4 +.set sgprStructBit, 97 // same index as sgprTemp5 +.set sgprStructNum, 98 // same index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // NOTE(review): same index as sgprSrdC (20) - confirm the overlap is intended + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] 
v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: 
vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 32 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 4096 +.set LDS_BLK_OFFSET_64Kmasked, 4096 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // fills s[24:39]: sgprSizesFree (24) through sgprStridesC+1 (39) +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // fills sgprStridesA (40,41) and sgprStridesB (42,43) +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // fills sgprAlpha (44) and sgprBeta (45) +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 // NOTE(review): only feeds the commented-out load below - appears unused, confirm +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] // keep the incoming workgroup id before it is remapped +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 // SizesFree0 = SizesFree0 / 2 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 // StridesA = StridesA / 2 + +v_mov_b32 v[vgprSerial], v0 // keep v0 as the thread serial +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] // serial of the first active lane +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 // WaveID = serial / 64 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 // GlWaveID = WaveID mod GLWAVES +s_mov_b32 s[sgprLDSMask], 0x4000 + + +v_mov_b32 v8, MT0 // v8 = MT0 (divisor, kept in a VGPR) +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = 
ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ 
+s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % 
WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // s78 = NumWorkGroups1 / WGM +v_rcp_iflag_f32 v6, v6 // s78 = NumWorkGroups1 / WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // s78 = NumWorkGroups1 / WGM +v_mul_f32 v6, v6, v7 // s78 = NumWorkGroups1 / WGM +v_cvt_u32_f32 v6, v6 // s78 = NumWorkGroups1 / WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // s78 = NumWorkGroups1 / WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // s78 = NumWorkGroups1 / WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // s78 = NumWorkGroups1 / WGM +v_add_u32 v6, 1, v6 // s78 = NumWorkGroups1 / WGM +s_mov_b64 exec, -1 // s78 = NumWorkGroups1 / WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] // s78 = remainder for the last block, otherwise WGM +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = s[sgprWorkGroup0Ori] +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; 256 is the total thread count here, adjust it to match the number of waves +v_mul_lo_u32 
v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + +//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice +v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + + + + + + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 5, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 31, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 7, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element + + + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + + + + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 
1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + + + +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//fixed input: v[vgprTemp0]->added address; +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 
s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] // Temp3 = reducer where the compare above is true, else 0 +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] // subtract the selected amount from the address +.endm +v_mov_b32 v[vgprTemp2], 0x200 // reducer for ADDR_WRAP +v_mov_b32 v[vgprTemp1], 0x400 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // max clip = 0x400 + s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // max clip = 0x800 + s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*32 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 // move (GlWaveID & 3) into bits 16 and up +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep a copy of the initial B write address +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] // second B write address = first + 0x8000 + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] +v_lshlrev_b32 v[sgprTemp2], 3, v[vgprTemp2] // NOTE(review): destination is v[sgprTemp2], a VGPR indexed through an SGPR symbol; v[vgprTemp2] may have been intended - confirm +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] +s_mov_b32 s[sgprTemp1], 0x210 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[sgprTemp2] // reads the v[sgprTemp2] value written above +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], 
v[vgprLocalReadAddrB], s[sgprTemp1] +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+1], 0x100, v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+2], 0x120, v[vgprLocalReadAddrB] + +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 9, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 512 +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], s[sgprTemp0] + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16 
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp1], 8 // tlu=0, scaled tile-offset by stride +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... */ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetB1:vgprGlobalReadOffsetB1+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // remaining limit smaller than the increment? then clamp the limit to 0 +s_cmov_b32 s[sgprScale+2], 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // limit still >= increment? then use limit minus increment 
+s_cmov_b32 s[sgprScale+2], s[sgprTemp2] + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // remaining limit smaller than the increment? then clamp the limit to 0 +s_cmov_b32 s[sgprZero+2], 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // limit still >= increment? then use limit minus increment +s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1] +.endm + +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 
v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+0], 16, v[vgprValuA_X0_H0+0] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 16, v[vgprValuA_X0_H0+1] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 
v[vgprValuA_X0_H0+2], 16, v[vgprValuA_X0_H0+2] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 16, v[vgprValuA_X0_H0+3] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[vgprValuA_X0_H0+4] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 16, v[vgprValuA_X0_H0+5] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+6], 16, 
v[vgprValuA_X0_H0+6] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 16, v[vgprValuA_X0_H0+7] // convert C to bf16 + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] +.endm + +/******************************************/ +/* Define Global Load address Increase... */ +/******************************************/ + +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] +//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] +//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + + +.endm + +/******************************************/ +/* Define LDS Load... */ +/******************************************/ + +.macro LDS_LOADAB off:req + +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off + +ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0 +ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB+1] offset:\off+0 +.endm + +.macro LDS_LOADAB1 off:req + +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off + +ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32 +ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB+2] offset:\off+0 +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 + + +s_min_u32 s[sgprLoopCntCommon], 4, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cbranch_scc1 SkipGL + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], 
v[vgprGlobalReadOffsetB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB1+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + +.endif + + + + + +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + + + + +s_cmp_lt_i32 s[sgprTemp3], 2 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(8) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 
s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_waitcnt vmcnt(4) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... */ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x4 +v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] 
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... */ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 + +GLOBAL_INC_Scale_Zero + +v_mov_b32 v[vgprMask1], 0x7fff0000 +v_mov_b32 v[vgprMask2], 0x7fff +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt 
lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +WaveID_gecase: + + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW4_7: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLoopCntCommon] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump 
store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 
TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW4_7 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/******************************************/ +/* Tail Loop Process ... 
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1] + +s_mul_i32 s[sgprTemp2], 8192, s[sgprWaveID] +v_add_u32 v[vgprLocalWriteC], s[sgprTemp2], v[vgprLocalWriteC] +s_barrier + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3 +v_add_u32 v[vgprLocalReadC], 0x8000, v[vgprLocalWriteC] +Skip_Wave0_3: + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7 +s_mov_b32 s[sgprTemp2], 0x8000 +v_sub_u32 v[vgprLocalReadC], v[vgprLocalWriteC], s[sgprTemp2] +Skip_Wave4_7: + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3_W + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:1792 + +s_waitcnt lgkmcnt(0) +s_barrier +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+0] offset:768 +ds_read_b32 
v[vgprTmpValC+10], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+0] offset:1792 + +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+8] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+12] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+11] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+15] +Skip_Wave0_3_W: + + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7_W + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +s_waitcnt lgkmcnt(0) +s_barrier + + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+0] offset:1792 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+8] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+12] +v_add_f32 v[vgprTmpValC+1], 
v[vgprTmpValC+1], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+11] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+15] + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprLocalWriteC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + +Skip_Wave4_7_W: + + + + + +/******************************************/ +/* Global Write Process ... 
*/ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], 
v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], 
v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 
+s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], 
v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] 
+v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +/* +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store 
lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], 
v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], 
s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], 
v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 +*/ +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..83954c3d22eac3087aa2a4e292b9a0518f9e188e --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,2247 @@ + +/******************************************/ +/* Begin Kernel */ 
+/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.globl Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 36864 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 
+ - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + 
.address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 36864 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprMask1, 54 +.set vgprMask2, 55 +.set vgprBFtemp, 56 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set 
sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // define debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same register index as sgprAddressC (30) - confirm the overlap is intended +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 // NOTE(review): sgprWaveID is set again to 88 further down in this map +.set sgprLDSMask, 75 // NOTE(review): sgprLDSMask is set again to 89 further down in this map +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same register index as sgprTensor2dSizeA +.set sgprWaveID, 88 // second .set of sgprWaveID (first was 64) +.set sgprLDSMask, 89 // second .set of sgprLDSMask (first was 75) + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same register index as sgprTemp4 +.set sgprStructBit, 97 // same register index as sgprTemp5 +.set sgprStructNum, 98 // same register index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 // same register index as the first sgprLDSMask value (75) +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 // same register index as the first sgprWaveID value (64) +.set sgprScale, 20 // NOTE(review): same register index as sgprSrdC (20) - confirm the overlap is intended + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: 
vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // C tile 0: A_X2 pair 0, B_X0 pair 0; dst and last source are the same C registers. The 8 MMACs of this macro cover 2 A pairs x 4 B pairs. +s_setprio 1 // priority 1 for the remaining MMACs of this macro +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // C tile 1: A_X2 pair 1, B_X0 pair 0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // C tile 2: A_X2 pair 0, B_X0 pair 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // C tile 3: A_X2 pair 1, B_X0 pair 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // C tile 4: A_X2 pair 0, B_X0 pair 2 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // C tile 5: A_X2 pair 1, B_X0 pair 2 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // C tile 6: A_X2 pair 0, B_X0 pair 3 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // C tile 7: A_X2 pair 1, B_X0 pair 3 + + +s_setprio 0 // back to priority 0 at the end of the macro +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_bf16 
v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // C tile 0: A_X3 pair 0, B_X1 pair 0; same C tiles and pairing as MMAC_32x64_0 but reading the A_X3 and B_X1 registers +s_setprio 1 // priority 1 for the remaining MMACs of this macro +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // C tile 1: A_X3 pair 1, B_X1 pair 0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // C tile 2: A_X3 pair 0, B_X1 pair 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // C tile 3: A_X3 pair 1, B_X1 pair 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // C tile 4: A_X3 pair 0, B_X1 pair 2 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // C tile 5: A_X3 pair 1, B_X1 pair 2 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // C tile 6: A_X3 pair 0, B_X1 pair 3 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] 
v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // C tile 7: A_X3 pair 1, B_X1 pair 3 +s_setprio 0 // back to priority 0 at the end of the macro +.endm + +.set MT0, 32 +.set MT1, 64 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 6144 +.set LDS_BLK_OFFSET_64Kmasked, 6144 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // fills s24..s39, i.e. sgprSizesFree through the sgprStridesC pair per the .set map +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // fills s40..s43, i.e. the sgprStridesA and sgprStridesB pairs +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // fills s44 (sgprAlpha) and s45 (sgprBeta) +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 // NOTE(review): 16 plus 0x60 is kernarg byte 112, which the metadata names dstD - confirm the host passes the zero-point pointer here +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 // NOTE(review): 16 plus 0x68 is kernarg byte 120, which the metadata names Synchronizer - confirm the host passes the scale pointer here +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 // first-lane serial shifted right by 6, i.e. divided by 64 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x9000 + + +v_mov_b32 v8, MT0 // v8 = MT0 (destination is a VGPR) +v_mov_b32 v7, 
s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 
s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 
+// WGM remap (continued). On entry s76 = wg1 / WGM (blockId) and s77 = (wg1 % WGM) * NumWorkGroups0,
+// both produced by the instructions immediately above.
+s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0
+// s78 = NumWorkGroups1 / WGM via a float reciprocal. The quotient is then corrected: v_cmpx restricts exec
+// to lanes whose computed remainder equals the divisor, those lanes add 1, and exec is restored to all lanes.
+v_cvt_f32_u32 v6, s[sgprWGM] // WGM
+v_rcp_iflag_f32 v6, v6 // WGM
+v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM
+v_mul_f32 v6, v6, v7 // WGM
+v_cvt_u32_f32 v6, v6 // WGM
+v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM
+v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM
+v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM
+v_add_u32 v6, 1, v6 // WGM
+s_mov_b64 exec, -1 // WGM
+v_readfirstlane_b32 s78, v6 // quotient
+s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor
+s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder
+s_cmp_eq_u32 s79, 0 // remainder == 0 ?
+s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0
+s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ?
+s_cselect_b32 s78, s79, s[sgprWGM] // s78 = block width: s79 when blockId >= numFullBlocks, otherwise WGM
+// WorkGroup0 = wgSerial / s78 (same reciprocal-divide-and-correct sequence as above)
+v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78
+v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78
+v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78
+v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78
+v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78
+v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78
+v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78
+v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78
+v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78
+v_mov_b32 v7, 0 // only written on the lanes still enabled by v_cmpx
+s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78
+v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient
+v_readfirstlane_b32 s[sgprWorkGroup1], v7 // value is overwritten by the s_mul_i32 on the next line
+s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor
+s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder
+s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM
+s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM
+label_WGM:
+
+
+// debug_buffer is 0, so the three .if blocks below are not assembled unless it is changed to non-zero.
+.set debug_buffer, 0
+.if debug_buffer // compute each workgroup's address offset into the debug buffer
+v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved before the remap
+v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; 256 is the total thread count here, adjust it to match the number of waves
+v_mul_lo_u32 v6, 2, v5 // v6 = v5 * 2
+v_add_u32 v7, v6, v[vgprSerial] // v7 = v6 + thread serial
+v_mul_lo_u32 v8, 0x40, v7 // v8 = v7 * 0x40, used below as the offset added to the debug base address
+v_mov_b32 v2, 0 // zero high word for the carry add
+v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high word of the debug base address
+v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // low word: AddressDbg + v8, carry out in vcc
+v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // high word: AddressDbg+1 + carry
+.endif
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprSerial]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+/******************************************/
+/* Generate Global A parameters ... */
+/******************************************/
+.set COALESCE_THREAD_A, 8 //x4 load
+.set LOG2_COALESCE_THREAD_A, 3 //x4 load
+
+// Per-lane byte offset for A:
+//   (lane & 7) * 8 + ((lane >> 3) * GLWAVES + GlWaveID) * StridesA * BPE, with lane = serial & 63
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0]
+v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0]
+v_mul_lo_u32 v0, 8, v0
+s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE
+v_mul_lo_u32 v1, v1, s[sgprTemp0]
+v_add_u32 v[vgprGlobalReadOffsetA], v0, v1
+s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID]
+v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1]
+
+// 64-bit Tensor2dSizeA = StridesA * SizesSum; ShadowLimitA = Tensor2dSizeA - WorkGroup0 * MT0
+s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum]
+s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum]
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0
+// scale the 64-bit limit by 2^LOG2BPE
+s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+// SRD base = AddressA + 2 * (StridesA+1 * WorkGroup2 + WorkGroup0 * MT0)
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG
+s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1
+s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2]
+s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3]
+s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD
+
+s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? (SCC is consumed by the branch that follows)
+s_cbranch_scc1 label_SkipMmac
+
+
+// When SizesFree+0 is odd, grow the A shadow limit by 2 and refresh SrdA+2; skipped when it is even.
+s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // Temp1 = SizesFree+0 & 1
+s_cmp_eq_u32 s[sgprTemp1], 0 // even?
+s_cbranch_scc1 label_skiPadA
+s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad
+s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+label_skiPadA:
+
+s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] // GlobalReadIncsA = DEPTHU * BPE * StridesA
+
+/******************************************/
+/* Generate Global B parameters ... */
+/******************************************/
+.set COALESCE_THREAD_B, 8 //x4 load
+.set LOG2_COALESCE_THREAD_B, 3 //x4 load
+
+// v[vgprTemp0] = ((serial & 63) >> 4) * 16 + ((serial & 255) >> 6) * 4 + ((serial & 15) >> 2)
+// v[vgprTemp1] = (serial & 3) * 8
+v_and_b32 v[vgprTemp1], 255, v[vgprSerial]
+v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1]
+v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1]
+v_and_b32 v[vgprTemp0], 63, v[vgprSerial]
+v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0]
+v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0]
+v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1]
+v_and_b32 v[vgprTemp1], 15, v[vgprSerial]
+v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1]
+v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1]
+v_and_b32 v[vgprTemp1], 3, v[vgprSerial]
+v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1]
+s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd
+// clamp v[vgprTemp0] to SizesFree+1 - WorkGroup1 * MT1 - 1
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624
+s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0]
+s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1
+v_mov_b32 v[vgprTemp3], s[sgprTemp0] // clamp value into a VGPR for v_min
+v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3]
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // ReadOffsetB+0 = Temp0 << (StridesB >> 13)
+v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1]
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element
+s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd
+// StructNum / StructBit: each compare overrides the previous pick, so the largest threshold that SizesSum
+// exceeds wins (512/26, 1024/27, 2048/28, 4096/29). Neither register is written here when SizesSum <= 512.
+s_cmp_gt_u32 s[sgprSizesSum+0], 512
+s_cmov_b32 s[sgprStructNum], 512
+s_cmov_b32 s[sgprStructBit], 26
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 1024
+s_cmov_b32 s[sgprStructNum], 1024
+s_cmov_b32 s[sgprStructBit], 27
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 2048
+s_cmov_b32 s[sgprStructNum], 2048
+s_cmov_b32 s[sgprStructBit], 28
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 4096
+s_cmov_b32 s[sgprStructNum], 4096
+s_cmov_b32 s[sgprStructBit], 29
+
+// Temp7 = 1 when SizesSum is exactly 512, 1024, 2048 or 4096, else 0
+s_mov_b32 s[sgprTemp7], 0
+s_cmp_eq_u32 s[sgprSizesSum+0], 4096
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 2048
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 1024
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 512
+s_cmov_b32 s[sgprTemp7], 1
+// Only when Temp7 == 0 and SizesSum > 512: StrideStruct = StridesB - StructNum
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cbranch_scc1 Skip_K4096
+s_cmp_le_u32 s[sgprSizesSum+0], 512
+s_cbranch_scc1 Skip_K_Sub2048
+s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum]
+Skip_K_Sub2048:
+Skip_K4096:
+
+// Same guard again: recompute the B read offsets using StrideStruct
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cbranch_scc1 Skip2_K4096
+s_cmp_le_u32 s[sgprSizesSum+0], 512
+s_cbranch_scc1 Skip2_K_Sub2048
+
+v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0]
+v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0]
+v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2]
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1]
+v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1]
+Skip2_K_Sub2048:
+Skip2_K4096:
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cmov_b32 s[sgprStrideStruct], 0 // StrideStruct = 0 when Temp7 == 1
+
+// 64-bit Tensor2dSizeB = SizesFree+1 * StridesB; ShadowLimitB = Tensor2dSizeB - WorkGroup1 * MT1 * StridesB
+s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB]
+s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB]
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB]
+
+s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0
+// scale the 64-bit limit by 2^LOG2BPE
+s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE
+s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+// Build the buffer descriptor for B. SrdB+2 written on the next line is provisional: it is replaced by the
+// struct index limit near the end of this section.
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG
+s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1
+s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // SRD base high word plus carry
+s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD
+
+s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ?
+s_cbranch_scc1 label_SkipMmac
+
+// Default value ORed into SrdB+1: ((StridesB * 2) >> (StridesB >> 13)) << 16, with bit 30 set.
+// NOTE(review): bits 16 and up of descriptor word 1 look like the stride field - confirm against the target ISA.
+s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // Temp0 = StridesB >> 13
+s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // Temp1 = StridesB * 2
+s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // Temp2 = Temp1 >> Temp0
+s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // move into bits 16 and up
+s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // set bit 30
+
+// When Temp7 != 1 and SizesSum > 512, use (1 << StructBit) | 0x40000000 instead
+s_cmp_eq_u32 s[sgprTemp7], 1 //
+s_cbranch_scc1 Skip1_K4096 //
+s_cmp_le_u32 s[sgprSizesSum+0], 512 //
+s_cbranch_scc1 Skip1_K_Sub2048 //
+s_mov_b32 s[sgprStructNum], 1 //
+s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // StructNum = 1 << StructBit
+s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // set bit 30
+s_mov_b32 s[sgprTemp1], s[sgprStructNum] //
+Skip1_K_Sub2048: //
+Skip1_K4096:
+
+s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // merge the value selected above into SrdB+1
+
+//Struct index Limit
+// Temp0 = SizesFree+1 % MT1, or MT1 when that is 0; used only by the last workgroup along dimension 1,
+// every other workgroup uses MT1.
+s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1]
+s_cmp_eq_u32 s[sgprTemp0], 0
+s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0]
+s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1
+s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1]
+s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1
+//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge
+s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] // replaces the ShadowLimitB-based value written at the top
+
+s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE
+
+/******************************************/
+/* Generate LDS A parameters ... */
+/******************************************/
+.set WAVE_LDS_OFFSET_A, 64*8 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load
+//.set LDS_SUB_M_OFFSET, BPE*32
+.set LDS_SUB_M_OFFSET, 16
+.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES
+.set LOADxWAVES_K_A_LOG2, 4
+.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES
+
+//Wrap Lds
+// LocalWriteAddrA = GlWaveID * WAVE_LDS_OFFSET_A in the low bits, with (GlWaveID & 1) * 4 placed at bit 16.
+// NOTE(review): the meaning of the bits above 16 is not visible here - confirm where this value is consumed.
+s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad
+s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16
+s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0]
+s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] // keep an unmodified copy
+
+//get lds read addrA
+// LocalReadAddrA = (lane >> 4) * 0x40 + (lane & 15) + WaveID * LDS_SUB_M_OFFSET, with lane = serial & 63
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_and_b32 v0, 15, v[vgprTemp0]
+v_lshrrev_b32 v1, 4, v[vgprTemp0]
+/*
+v_and_b32 v3, 1, v1
+v_mul_u32_u24 v3, 0x40, v3
+v_mul_u32_u24 v2, WAVE_LDS_OFFSET_A+0, v1
+v_add_u32 v2, v3, v2
+*/
+v_mul_u32_u24 v2, 0x40, v1
+
+v_add_u32 v[vgprLocalReadAddrA], v2, v0
+s_mul_i32 s[sgprTemp0], s[sgprWaveID], LDS_SUB_M_OFFSET
+v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0]
+
+// Seven further read addresses at fixed offsets from LocalReadAddrA
+v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA]
+
+v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA]
+
+
+// ADDR_WRAP vaddr: per lane, if vaddr >= v[vgprTemp1] then vaddr -= v[vgprTemp2].
+//   vaddr          - VGPR operand holding the address; updated in place
+//   v[vgprTemp1]   - threshold; must be set by the caller
+//   v[vgprTemp2]   - amount subtracted at or above the threshold; must be set by the caller
+// Clobbers v[vgprTemp3] and s[sgprTemp0:sgprTemp1].
+.macro ADDR_WRAP vaddr:req
+v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1]
+v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1]
+v_sub_u32 \vaddr, \vaddr, v[vgprTemp3]
+.endm
+// Wrap two of the A read addresses: subtract 0x200 when the address has reached the threshold in v[vgprTemp1].
+v_mov_b32 v[vgprTemp2], 0x200
+v_mov_b32 v[vgprTemp1], 0x400
+ADDR_WRAP v[vgprLocalReadAddrA+5]
+v_mov_b32 v[vgprTemp1], 0x800
+ADDR_WRAP v[vgprLocalReadAddrA+7]
+
+.set WAVE_LDS_OFFSET, UNDEF //x4 load
+.set WAVE_LDS_OFFSET, UNDEF //x4 load (NOTE(review): exact duplicate of the previous line - confirm whether another symbol was intended)
+
+/******************************************/
+/* Generate LDS B parameters ... */
+/******************************************/
+.set WAVE_LDS_OFFSET_B, 64*16 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load
+.set LDS_SUB_N_OFFSET, BPE*64
+.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES
+.set LOADxWAVES_K_B_LOG2, 5
+.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES
+
+//Wrap Lds
+// LocalWriteAddrB = (GlWaveID * WAVE_LDS_OFFSET_B | (GlWaveID & 3) << 16) + LDS_B_OFFSET
+s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad
+s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3
+//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16
+s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0]
+s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET
+s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep an unmodified copy
+
+// LocalReadAddrB = (lane & 3) * 64 + ((lane & 15) >> 2) * 0x410 + (lane >> 4) * 8 + LDS_B_OFFSET, lane = serial & 63
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_and_b32 v[vgprTemp1], v[vgprTemp0], 3
+v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0]
+v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1]
+// The shifted value is kept in v[vgprTemp2]. It used to be written to v[sgprTemp2], which indexes the vector
+// register file with a scalar-register symbol and overwrites whichever VGPR happens to have that number.
+v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2]
+v_and_b32 v[vgprTemp0], v[vgprTemp0], 15
+v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0]
+s_mov_b32 s[sgprTemp1], 0x410
+v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0]
+v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0]
+v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2]
+s_mov_b32 s[sgprTemp1], LDS_B_OFFSET
+v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1]
+v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] // keep an unmodified copy
+
+// Second read address at +768; v4 = LDS_B_OFFSET + ((lane & 15) >> 2) * 1024 + 1024 is the bound it is
+// compared against by the instructions that follow.
+v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB]
+v_and_b32 v0, 63, v[vgprSerial] // v0 = lane
+v_and_b32 v1, 15, v0 // v1 = lane & 15
+v_lshrrev_b32 v2, 2, v1 // v2 = (lane & 15) >> 2
+v_lshlrev_b32 v3, 10, v2 // v3 = v2 * 1024
+v_add_u32 v3, LDS_B_OFFSET, v3 // v3 += LDS_B_OFFSET
+v_add_u32 v4, 1024, v3 // v4 = v3 + 1024
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 1024 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] // +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 1024 +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp1], 8 // tlu=0, scaled tile-offset by stride +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 
0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprScale+2], 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprZero+2], 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // are we within 2^32? 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1] +.endm + +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 
v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+0], 16, v[vgprValuA_X0_H0+0] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 16, v[vgprValuA_X0_H0+1] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+2], 16, v[vgprValuA_X0_H0+2] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask1], 
s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 16, v[vgprValuA_X0_H0+3] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[vgprValuA_X0_H0+4] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 16, v[vgprValuA_X0_H0+5] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+6], 16, v[vgprValuA_X0_H0+6] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // 
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 16, v[vgprValuA_X0_H0+7] // convert C to bf16
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2]
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7]
+.endm
+
+// I4ToFp16_old: unpack the sixteen 4-bit fields held in two consecutive
+// input VGPRs (vgprIn, vgprIn+1), convert each one with the I32ToF16 macro
+// and pack the results pairwise into eight output VGPRs.
+//   vgprIn    - first of two input VGPRs holding 8 nibbles each
+//   vgprZero  - base of 4 zero-point VGPRs (index = nibble index mod 4)
+//   vgprScale - base of 4 scale VGPRs      (index = nibble index mod 4)
+//   vgprPack  - base of 8 output VGPRs; vgprIn fills +0,+2,+4,+6 and
+//               vgprIn+1 fills +1,+3,+5,+7
+// Clobbers vgprValuA_X0_H0+0 .. +15 as scratch.
+// NOTE(review): I32ToF16 is defined outside this chunk; its exact
+// zero/scale arithmetic is not visible here.
+.macro I4ToFp16_old vgprIn:req vgprZero:req vgprScale:req vgprPack:req
+
+// Nibble i of the first input dword -> scratch register i (shift by 4*i, mask 0xf).
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 28, v[\vgprIn]
+v_lshrrev_b32 v[vgprValuA_X0_H0+6], 24, v[\vgprIn]
+v_lshrrev_b32 v[vgprValuA_X0_H0+5], 20, v[\vgprIn]
+v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[\vgprIn]
+v_lshrrev_b32 v[vgprValuA_X0_H0+3], 12, v[\vgprIn]
+v_lshrrev_b32 v[vgprValuA_X0_H0+2], 8, v[\vgprIn]
+v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn]
+v_lshrrev_b32 v[vgprValuA_X0_H0+0], 0, v[\vgprIn]
+v_and_b32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0], 0xf
+v_and_b32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1], 0xf
+v_and_b32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2], 0xf
+v_and_b32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3], 0xf
+v_and_b32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4], 0xf
+v_and_b32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5], 0xf
+v_and_b32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6], 0xf
+v_and_b32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7], 0xf
+
+// Convert in place; nibbles 4..7 reuse zero/scale registers 0..3.
+I32ToF16 vgprValuA_X0_H0+0 \vgprZero+0 \vgprScale+0
+I32ToF16 vgprValuA_X0_H0+1 \vgprZero+1 \vgprScale+1
+I32ToF16 vgprValuA_X0_H0+2 \vgprZero+2 \vgprScale+2
+I32ToF16 vgprValuA_X0_H0+3 \vgprZero+3 \vgprScale+3
+I32ToF16 vgprValuA_X0_H0+4 \vgprZero+0 \vgprScale+0
+I32ToF16 vgprValuA_X0_H0+5 \vgprZero+1 \vgprScale+1
+I32ToF16 vgprValuA_X0_H0+6 \vgprZero+2 \vgprScale+2
+I32ToF16 vgprValuA_X0_H0+7 \vgprZero+3 \vgprScale+3
+
+// Pack (i, i+4) pairs into the even output registers.
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+4]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+5]
+v_pack_b32_f16 v[\vgprPack+4], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+6], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+7]
+
+// Same sequence for the second input dword, using scratch registers 8..15.
+v_lshrrev_b32 v[vgprValuA_X0_H0+15], 28, v[\vgprIn+1]
+v_lshrrev_b32 v[vgprValuA_X0_H0+14], 24, v[\vgprIn+1]
+v_lshrrev_b32 v[vgprValuA_X0_H0+13], 20, v[\vgprIn+1]
+v_lshrrev_b32 v[vgprValuA_X0_H0+12], 16, v[\vgprIn+1]
+v_lshrrev_b32 v[vgprValuA_X0_H0+11], 12, v[\vgprIn+1]
+v_lshrrev_b32 v[vgprValuA_X0_H0+10], 8, v[\vgprIn+1]
+v_lshrrev_b32 v[vgprValuA_X0_H0+9], 4, v[\vgprIn+1]
+v_lshrrev_b32 v[vgprValuA_X0_H0+8], 0, v[\vgprIn+1]
+
+v_and_b32 v[vgprValuA_X0_H0+8], v[vgprValuA_X0_H0+8], 0xf
+v_and_b32 v[vgprValuA_X0_H0+9], v[vgprValuA_X0_H0+9], 0xf
+v_and_b32 v[vgprValuA_X0_H0+10], v[vgprValuA_X0_H0+10], 0xf
+v_and_b32 v[vgprValuA_X0_H0+11], v[vgprValuA_X0_H0+11], 0xf
+v_and_b32 v[vgprValuA_X0_H0+12], v[vgprValuA_X0_H0+12], 0xf
+v_and_b32 v[vgprValuA_X0_H0+13], v[vgprValuA_X0_H0+13], 0xf
+v_and_b32 v[vgprValuA_X0_H0+14], v[vgprValuA_X0_H0+14], 0xf
+v_and_b32 v[vgprValuA_X0_H0+15], v[vgprValuA_X0_H0+15], 0xf
+
+I32ToF16 vgprValuA_X0_H0+8 \vgprZero+0 \vgprScale+0
+I32ToF16 vgprValuA_X0_H0+9 \vgprZero+1 \vgprScale+1
+I32ToF16 vgprValuA_X0_H0+10 \vgprZero+2 \vgprScale+2
+I32ToF16 vgprValuA_X0_H0+11 \vgprZero+3 \vgprScale+3
+I32ToF16 vgprValuA_X0_H0+12 \vgprZero+0 \vgprScale+0
+I32ToF16 vgprValuA_X0_H0+13 \vgprZero+1 \vgprScale+1
+I32ToF16 vgprValuA_X0_H0+14 \vgprZero+2 \vgprScale+2
+I32ToF16 vgprValuA_X0_H0+15 \vgprZero+3 \vgprScale+3
+
+// Pack (i, i+4) pairs into the odd output registers.
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+8], v[vgprValuA_X0_H0+12]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+9], v[vgprValuA_X0_H0+13]
+v_pack_b32_f16 v[\vgprPack+5], v[vgprValuA_X0_H0+10], v[vgprValuA_X0_H0+14]
+v_pack_b32_f16 v[\vgprPack+7], v[vgprValuA_X0_H0+11], v[vgprValuA_X0_H0+15]
+.endm
+
+
+
+
+/******************************************/
+/* Define Global Load address Increase... 
*/
+/******************************************/
+
+// GLOBAL_INC: advance the A and B buffer descriptors by one global-read step.
+//   A: the 64-bit base in sgprSrdA+0/+1 is increased by sgprGlobalReadIncsA+0
+//      (zero-extended to 64 bits through sgprTemp1 = 0), the 64-bit
+//      sgprShadowLimitA is decreased by the same amount, and sgprSrdA+2 is
+//      reloaded from the shadow limit (or BufferLimit when the upper dword of
+//      the shadow limit is non-zero).
+//   B: only the base in sgprSrdB+0/+1 is advanced; the shadow-limit update is
+//      commented out, so sgprSrdB+2 is left unchanged by this macro.
+// Clobbers sgprTemp0, sgprTemp1 and SCC.
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0]
+s_mov_b32 s[sgprTemp1], 0 // upper dword of the increment
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1]
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0]
+s_mov_b32 s[sgprTemp1], 0 // upper dword of the increment
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1]
+// NOTE(review): the B-side limit tracking below is disabled; confirm that
+// leaving sgprSrdB+2 at its initial value is intended.
+//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0]
+//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1]
+//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
+
+
+.endm
+
+/******************************************/
+/* Define LDS Load... 
*/
+/******************************************/
+
+// LDS_LOADAB: issue the LDS reads for the X0 operand set at byte offset off.
+//   A: four ds_read_u8 into vgprValuA_X0_I0+0..3, one per address register
+//      vgprLocalReadAddrA+0..3.
+//   B: four ds_read_b64 into vgprValuB_X0_I0+0..7; three from
+//      vgprLocalReadAddrB at off+0/+256/+512 and one from
+//      vgprLocalReadAddrB+1 at off+0.
+// Issues 8 LDS reads and does not wait for them; the visible call sites
+// follow it with an s_waitcnt on lgkmcnt.
+.macro LDS_LOADAB off:req
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256
+ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512
+ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0
+.endm
+
+// LDS_LOADAB1: same shape as LDS_LOADAB, for the X1 operand set.
+//   A: vgprValuA_X1_I0+0..3 from vgprLocalReadAddrA+4..7.
+//   B: vgprValuB_X1_I0+0..7 from vgprLocalReadAddrB at off+32/+288/+544
+//      (32 bytes past the X0 offsets) and from vgprLocalReadAddrB+2 at off+0.
+.macro LDS_LOADAB1 off:req
+
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288
+ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544
+ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ... 
*/
+/******************************************/
+
+// Loop-count setup (executed by every wave):
+//   sgprLoopCntCommon = (SizesSum >> 5) + (1 if SizesSum & 31 is non-zero)
+//   sgprLoopCounterL  = sgprLoopCntCommon - 1   (last block goes to the tail)
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5
+// NOTE(review): the result of this s_min_u32 is overwritten by the
+// s_add_u32 two instructions below before anything reads it.
+s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL]
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 // SCC feeds the next add
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+// Waves with WaveID < 4 skip the global-load path entirely (SkipGL).
+// The remaining waves only run the load loop below and finish at s_endpgm.
+s_cmp_lt_i32 s[sgprWaveID], 4
+s_cbranch_scc1 SkipGL
+s_cmp_eq_i32 s[sgprLoopCntCommon], 1 // single block: only the last load is needed
+s_cbranch_scc1 SkipToLastLoad
+s_mov_b32 s[sgprTemp3], 0 // number of blocks issued so far
+PreFetchBegin:
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+GLOBAL_INC
+
+// Advance the A write address by one LDS block and wrap back to the
+// original address once it reaches ori + sgprLDSMask.
+s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+
+
+// Same advance-and-wrap for the B write address.
+s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+
+// The first four iterations (sgprTemp3 = 0..3) issue without waiting;
+// afterwards each iteration waits for vmcnt(8) and joins a barrier.
+// NOTE(review): the vmcnt values step by 2, presumably matching the number
+// of loads GLOBAL_LOADAB issues per call - confirm against that macro.
+s_cmp_lt_i32 s[sgprTemp3], 4
+s_cbranch_scc1 SkipWait
+s_waitcnt vmcnt(8)
+s_barrier
+SkipWait:
+s_add_u32 s[sgprTemp3], s[sgprTemp3], 1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 1 // keep one block for the last load
+s_cbranch_scc1 PreFetchBegin
+// Drain: enter the wait/barrier ladder at the rung that matches how many
+// blocks were issued (1, 2, 3, or 4 and more), then fall through to Last1.
+s_cmp_eq_i32 s[sgprTemp3], 1
+s_cbranch_scc1 Last1
+s_cmp_eq_i32 s[sgprTemp3], 2
+s_cbranch_scc1 Last2
+s_cmp_eq_i32 s[sgprTemp3], 3
+s_cbranch_scc1 Last3
+s_waitcnt vmcnt(6)
+s_barrier
+Last3:
+s_waitcnt vmcnt(4)
+s_barrier
+Last2:
+s_waitcnt vmcnt(2)
+s_barrier
+Last1:
+s_waitcnt vmcnt(0)
+s_barrier
+s_barrier
+
+// Final block: always written at the original LDS write addresses.
+SkipToLastLoad:
+s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt vmcnt(0)
+s_barrier
+s_endpgm
+SkipGL:
+
+/******************************************/
+/* Generate SrcD ... 
*/
+/******************************************/
+
+// Build the buffer descriptor for D in sgprSrdD+0..3:
+//   element offset = WorkGroup0*128 + WorkGroup1*MT1*StridesD
+//                    + StridesC+1 * WorkGroup2   (64-bit)
+//   base           = AddressD + 2 * element offset
+//   SrdD+2         = 2 * (StridesD+0 * SizesFree+1 - WorkGroup1*MT1*StridesD)
+// Clobbers sgprTemp0..3 and SCC.
+s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower)
+s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128
+s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] // kept in sgprTemp1 for the limit below
+s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+// NOTE(review): the WorkGroup2 term uses sgprStridesC+1 while the rest of
+// this section uses sgprStridesD - confirm the two strides are meant to agree.
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // elements -> bytes (x2)
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2]
+s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3]
+s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1]
+s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1]
+s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1
+s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD
+
+// Per-lane byte offset into D:
+//   lane = Serial & 63
+//   vgprGlobalWriteOffsetD = 2*(lane >> 4)*StridesD + 4*(lane & 15) + 2*32*WaveID
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0]
+v_and_b32 v[vgprTemp2], 15, v[vgprTemp0]
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2]
+v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD]
+v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1]
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2]
+s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0]
+
+/******************************************/
+/* Init ValueC ... 
*/
+/******************************************/
+// Start the first scale/zero-point load (macro defined outside this chunk),
+// then clear the 32 accumulator registers vgprValuC+0..31.
+GLOBAL_LOAD_Scale_Zero
+v_mov_b32 v[vgprValuC+0], 0x0
+v_mov_b32 v[vgprValuC+1], 0x0
+v_mov_b32 v[vgprValuC+2], 0x0
+v_mov_b32 v[vgprValuC+3], 0x0
+v_mov_b32 v[vgprValuC+4], 0x0
+v_mov_b32 v[vgprValuC+5], 0x0
+v_mov_b32 v[vgprValuC+6], 0x0
+v_mov_b32 v[vgprValuC+7], 0x0
+v_mov_b32 v[vgprValuC+8], 0x0
+v_mov_b32 v[vgprValuC+9], 0x0
+v_mov_b32 v[vgprValuC+10], 0x0
+v_mov_b32 v[vgprValuC+11], 0x0
+v_mov_b32 v[vgprValuC+12], 0x0
+v_mov_b32 v[vgprValuC+13], 0x0
+v_mov_b32 v[vgprValuC+14], 0x0
+v_mov_b32 v[vgprValuC+15], 0x0
+v_mov_b32 v[vgprValuC+16], 0x0
+v_mov_b32 v[vgprValuC+17], 0x0
+v_mov_b32 v[vgprValuC+18], 0x0
+v_mov_b32 v[vgprValuC+19], 0x0
+v_mov_b32 v[vgprValuC+20], 0x0
+v_mov_b32 v[vgprValuC+21], 0x0
+v_mov_b32 v[vgprValuC+22], 0x0
+v_mov_b32 v[vgprValuC+23], 0x0
+v_mov_b32 v[vgprValuC+24], 0x0
+v_mov_b32 v[vgprValuC+25], 0x0
+v_mov_b32 v[vgprValuC+26], 0x0
+v_mov_b32 v[vgprValuC+27], 0x0
+v_mov_b32 v[vgprValuC+28], 0x0
+v_mov_b32 v[vgprValuC+29], 0x0
+v_mov_b32 v[vgprValuC+30], 0x0
+v_mov_b32 v[vgprValuC+31], 0x0
+
+GLOBAL_INC_Scale_Zero
+
+// Constants for the f32 -> bf16 conversion sequences in this file:
+//   vgprMask1 is the value v_cndmask_b32 selects when the NaN compare is set,
+//   vgprMask2 is the rounding increment added by v_add3_u32.
+v_mov_b32 v[vgprMask1], 0x7fff0000
+v_mov_b32 v[vgprMask2], 0x7fff
+/******************************************/
+/* LoopCounter == 0, Skip to tail/last loop ... 
*/
+/******************************************/
+// NOTE(review): this early branch skips the UnPack* / v_mul_f32 setup
+// below, while TAIL_LOOP still passes vgprValuZerosI32 / vgprValuScalesF32
+// to I4ToFp16 - confirm those registers are valid when sgprLoopCounterL <= 0.
+s_cmp_le_i32 s[sgprLoopCounterL], 0
+s_cbranch_scc1 TAIL_LOOP
+s_waitcnt vmcnt(0)
+s_barrier
+
+// Preload the X0 operands of LDS block 0 before entering the loop.
+LDS_LOADAB LDS_BLK_OFFSET*0
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprValuZeros+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetZero+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+// Unpack scales and zero points, then replace each zero point by
+// -(zero * scale) in vgprValuZerosI32+0/+1.
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+
+/******************************************/
+/* Main Loop Process ... */
+/******************************************/
+
+// WaveID_gecase is an empty label that falls through to TAIL_LOOP.
+// NOTE(review): waves with WaveID >= 4 appear to end at s_endpgm in the
+// global-load section, so this branch looks unreachable for them - confirm.
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc1 WaveID_gecase
+
+// The loop body is unrolled six times. Step k reads LDS block k for the X1
+// operands, runs MMAC_32x64_0 on the converted X0 operands, joins a barrier,
+// prefetches X0 of block k+1 (wrapping to block 0 after block 5) and runs
+// MMAC_32x64_1 on the converted X1 operands. Steps 2, 4 and 6 also reload
+// and re-unpack the scales and zero points. sgprLoopCntCommon is decremented
+// after every step and the code leaves for TAIL_LOOP as soon as it reaches 0.
+s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL]
+MainLoopBeginW0_3:
+
+// ---- step 1: LDS block 0 ----
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(8)
+
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+2]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrA+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+
+.endif
+
+
+
+
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprValuZerosI32+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuScalesF32+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+/*
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+*/
+
+
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 2: LDS block 1, with scale/zero refresh ----
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 3: LDS block 2 ----
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 4: LDS block 3, with scale/zero refresh ----
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*4
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 5: LDS block 4 ----
+LDS_LOADAB1 LDS_BLK_OFFSET*4
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*5
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 6: LDS block 5, with scale/zero refresh; prefetch wraps to block 0 ----
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*5
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 MainLoopBeginW0_3
+
+
+// The counter is <= 0 here (the loop branch above was not taken), so this
+// branch is always taken and the s_branch after it is never executed.
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+s_branch WaveID_EndSwitch
+WaveID_gecase:
+
+WaveID_EndSwitch:
+
+/******************************************/
+/* Tail Loop Process ... */
+/******************************************/
+// Final block: always read from LDS block 0, with full lgkmcnt(0) waits
+// and no prefetch of a following block.
+TAIL_LOOP:
+
+s_waitcnt vmcnt(0)
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(0)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(0)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+
+/******************************************/
+/* Global Write Process ... */
+/******************************************/
+
+label_SkipMmac:
+
+// Both outcomes of this compare land on Beta_eqcase (the branch target is
+// the next instruction), so sgprBeta has no effect here.
+s_cmp_eq_u32 s[sgprBeta], 0
+s_cbranch_scc1 Beta_eqcase
+Beta_eqcase:
+// sgprD_MEdge = min(2 * (4*SizesFree+0 - 128*WorkGroup0), 256); it is the
+// bound the per-lane M offset is compared against before each store.
+// Note that sgprSizesFree+0 itself is overwritten with 4x its value.
+s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2
+s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128
+s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge]
+s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1
+s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2
+// Per-lane M offset: 2 * ((Serial & 15) + 32*WaveID).
+v_and_b32 v[vgprTemp2], v[vgprSerial], 15
+s_mov_b32 s[sgprTemp1], s[sgprWaveID]
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32
+v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1]
+v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2]
+v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store initial addr
+v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store initial addr
+
+.set Nvoff, 0
+v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD
+v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge
+v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge]
+v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1]
+v_mul_f32 
v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, 
v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], 
v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], 
v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + 
+s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 
v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 
v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] 
// Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 
s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], 
v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 
v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 
1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD 
+v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF 
+s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 
v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], 
v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set 
Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], 
v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s new file mode 100644 index 0000000000000000000000000000000000000000..c3bda19c30663cfe99c1274808c00c0cacdf2894 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s @@ -0,0 +1,2489 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.globl Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.p2align 8 +.type Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- 
+custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .symbol: 'Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + 
.value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: bf16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_BBS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 // base of the accumulator VGPRs; the MMAC_32x64_* macros address vgprValuC 0 .. 31 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 // same register index as vgprGlobalReadOffsetScale +.set vgprLocalWriteB, 198 // same register index as vgprGlobalReadOffsetZero + +.set vgprLocalWriteC, 196 // same register index as vgprLocalWriteA +.set vgprLocalReadC, 230 // same register index as vgprGLA +.set vgprTmpValC, 0 // same register index as vgprValuC + +.set vgprMask1, 54 +.set vgprMask2, 55 +.set vgprBFtemp, 56 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 212 +.set vgprGlobalReadOffsetB, 214 +.set vgprGlobalReadOffsetB1, 218 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 // same register index as vgprLocalReadAddrB +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 // NOTE(review): A lanes are 0, 1, 3 while B lanes are 32, 33, 34; confirm 3 (not 2) is intended +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set
sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // same register index as sgprAddressC +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprLDSMask, 75 // NOTE(review): re-assigned to 89 further down; index 75 is also given to sgprLocalWriteAddrZero +//.set sgprLoopforPfIter, 76 +//.set sgprLDSWriteIter, 78 + +.set sgprLocalWriteAddrA1, 76 +.set sgprLocalWriteAddrB1, 77 +.set sgprLocalWriteAddrA1ori, 78 +.set sgprLocalWriteAddrB1ori, 79 + + + +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same register index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 // second assignment of sgprLDSMask (first was 75) + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same register index as sgprTemp4 +.set sgprStructBit, 97 // same register index as sgprTemp5 +.set sgprStructNum, 98 // same register index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 // same index as the first sgprLDSMask assignment +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // same register index as sgprSrdC + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // C quad 0: A_X2 pair 0, B_X0 pair 0; same C quad is first and last operand (presumably A x B accumulated into C, confirm against the ISA doc) +s_setprio 1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0:
vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + + +s_setprio 0 +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_bf16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_bf16 
v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_bf16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 64 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 6144 +.set LDS_BLK_OFFSET_64Kmasked, 6144 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 
5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 // carry into the high dword of the kernarg pointer +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // s[24:39] = sgprSizesFree .. sgprStridesC in the .set map +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // s[40:43] = sgprStridesA, sgprStridesB +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // s[44:45] = sgprAlpha, sgprBeta +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) // wait until the scalar loads above have returned +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 // NOTE(review): only feeds the commented-out WGM-buffer load below; looks dead, confirm +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] // keep the incoming workgroup id before it is remapped below +// A uint8 to fp16 mcc (the two shifts below halve SizesFree[0] and StridesA) +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 // vgprSerial = v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 // wave id = first-lane serial >> 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 // wave id modulo GLWAVES (GLWAVES is 4) +s_mov_b32 s[sgprLDSMask], 0x6000 + + +v_mov_b32 v8, MT0 // v8 = MT0 (divisor, held in a vgpr) +v_mov_b32 v7, s[sgprSizesFree+0] // v7 = Free0 size (dividend) +v_cvt_f32_u32 v6, v8 // v6 = float(v8) +v_rcp_iflag_f32 v6, v6 // v6 = 1.0 / v6 +v_cvt_f32_u32 v9, v7 // v9 = float(v7) +v_mul_f32 v6, v6, v9 // v6 = v7 / v8 as a float estimate +v_cvt_u32_f32 v6, v6 // v6 = integer quotient estimate +v_mul_u32_u24 v9, v6, v8 // v6 = 
ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ 
+v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM 
+v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] // s78 = remainder for the last partial block, else WGM +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved before the remap (wg1*nwg0+wg0) +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; 256 is the total thread count here, adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 // v6 = v5 * 2 +v_add_u32 v7, v6, v[vgprSerial] // v7 = tid+NT*(wg1*nwg0+wg0) = serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = serial*0x40 (original note: serial*nipt*4) +v_mov_b32 v2, 0 // zero high-dword addend for the 64-bit add +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high dword of the debug buffer address +v_add_co_u32 v[vgprAddressDbg], vcc, 
s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + +//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice +v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + + + + + + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], 
s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 15, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element + + + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + + + + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 
1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + + + +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//fixed input: v[vgprTemp0]->added address; +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 
s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] // reducer where the compare hit, 0 elsewhere +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] // pull addresses at or past the clip back by the reducer +.endm +v_mov_b32 v[vgprTemp2], 0x200 // reducer +v_mov_b32 v[vgprTemp1], 0x400 // clip for LocalReadAddrA+5 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // clip is relative to the per-wave base held in s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 // clip for LocalReadAddrA+7 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*64 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 // (GlWaveID & 3) placed in bits 16 and up +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep the initial B write address +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane = serial & 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 // lane & 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] // lane >> 4 +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] // (lane & 3) * 64 +v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] // (lane >> 4) * 8, kept in vgprTemp2; this used to write v[sgprTemp2], i.e. an SGPR index used as a VGPR number +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] // (lane & 15) >> 2 +s_mov_b32 s[sgprTemp1], 0x410 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] // ((lane & 15) >> 2) * 0x410 +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2] // sum of the three lane-derived terms +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], 
v[vgprLocalReadAddrB], s[sgprTemp1] +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] + +v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB] +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 1024 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] // +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 1024 +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], s[sgprTemp0] + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], 
v[vgprGlobalReadOffsetScale] + + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 // serial & 15 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16 // (WaveID & 3) * 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // SizesSum / 64 +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] // (WaveID >> 2) * (SizesSum / 64) * StridesA +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] // taken only when WaveID < 8 +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride: high half comes from the LOW word (was s[sgprTemp1]); read here before the next line overwrites Temp0 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 // ShadowLimitA / 16 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // subtract the low word of the tile offset +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +// GLOBAL_LOADAB offset: issues four buffer loads carrying the lds modifier (A, A1, B, B1); before each one m0 is set to the matching LocalWriteAddr plus \offset +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB1], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB1:vgprGlobalReadOffsetB1+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +.endm + +// GLOBAL_LOAD_Scale_Zero: loads one byte into ValuZeros and one dword into ValuScales through the Zero and Scale descriptors +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +// GLOBAL_INC_Scale_Zero: advances the Scale and Zero descriptor bases and shrinks word 2 of each by the same step, clamping at 0; clobbers sgprTemp0..sgprTemp2 +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 // scale step = GlobalReadIncsA / 8 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] // candidate new limit = limit - step +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // limit < step ? +s_cmov_b32 s[sgprScale+2], 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // limit >= step ? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 // zero step = GlobalReadIncsA / 32 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] // candidate new limit = limit - step +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // limit < step ? +s_cmov_b32 s[sgprZero+2], 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // limit >= step ? 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_lshlrev_b32 v[\vgprOut+1], 16, v[\vgprOut+1] +.endm + +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 
v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+0], 16, v[vgprValuA_X0_H0+0] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+1], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 16, v[vgprValuA_X0_H0+1] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+2], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+2], 16, v[vgprValuA_X0_H0+2] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+3], v[vgprBFtemp], v[vgprMask1], 
s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 16, v[vgprValuA_X0_H0+3] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[vgprValuA_X0_H0+4] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+5], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 16, v[vgprValuA_X0_H0+5] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+6], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuA_X0_H0+6], 16, v[vgprValuA_X0_H0+6] // convert C to bf16 + +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuA_X0_H0+7], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // 
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 16, v[vgprValuA_X0_H0+7] // convert C to bf16 + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] +.endm + +/******************************************/ +/* Define Global Load adress Increase... */ +/******************************************/ + +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] +//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] +//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + + +.endm + +/******************************************/ +/* Define LDS Load... 
*/
+/******************************************/
+
+// LDS_LOADAB off
+// Issues 8 LDS reads at byte offset \off for the "X0" operand set:
+//   A: four ds_read_u8 into vgprValuA_X0_I0+0..3, one per address register
+//      vgprLocalReadAddrA+0..3
+//   B: four ds_read_b64 into vgprValuB_X0_I0+0..7; three from
+//      vgprLocalReadAddrB at +0 / +256 / +512 and one from
+//      vgprLocalReadAddrB+1 at +0
+// The main loop follows each call with s_waitcnt lgkmcnt(8), so these 8 reads
+// may still be in flight while the previously issued set is consumed.
+.macro LDS_LOADAB off:req
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256
+ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512
+ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0
+.endm
+
+// LDS_LOADAB1 off
+// Same shape as LDS_LOADAB for the "X1" operand set:
+//   A: vgprValuA_X1_I0+0..3 from vgprLocalReadAddrA+4..7
+//   B: vgprValuB_X1_I0+0..7; three reads from vgprLocalReadAddrB at
+//      +32 / +288 / +544 and one from vgprLocalReadAddrB+2 at +0
+.macro LDS_LOADAB1 off:req
+
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288
+ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544
+ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ...
*/
+/******************************************/
+
+// Loop count:
+//   sgprLoopCntCommon = (sgprSizesSum >> 6) + ((sgprSizesSum & 31) != 0)
+//   sgprLoopCounterL  = sgprLoopCntCommon - 1      (last pass is the tail)
+// s_and_b32 sets SCC when its result is non-zero; s_add_u32 then adds SCC as
+// a source operand.
+// NOTE(review): the shift is 6 but the remainder mask is still 31, which
+// matches the commented-out shift of 5 - confirm 63 is not intended.
+// NOTE(review): the s_min_u32 result (and its SCC) is overwritten by the next
+// two instructions without being read.
+//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6
+
+
+s_min_u32 s[sgprLoopCntCommon], 4, s[sgprLoopCounterL]
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+// Waves with sgprWaveID < 8 jump to SkipGL (compute path). The remaining
+// waves run the global-load path below and finish with s_endpgm.
+s_cmp_lt_i32 s[sgprWaveID], 8
+s_cbranch_scc1 SkipGL
+
+
+// Debug dumps, assembled only when debug_buffer is non-zero.
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB1+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+
+
+.endif
+
+
+
+
+
+// Prefetch loop (skipped entirely when only one pass is needed).
+// Each pass: issue one GLOBAL_LOADAB (4 buffer loads), advance the global
+// descriptors, then step the A/B LDS write addresses by
+// LDS_BLK_OFFSET_64Kmasked, wrapping back to the *ori address once
+// ori + sgprLDSMask is reached. The A1/B1 addresses are always A/B + 0x8000.
+// sgprTemp3 counts completed passes. From the third pass on, the wave waits
+// until at most 8 loads (two passes' worth) are outstanding and then hits a
+// barrier.
+s_cmp_eq_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 SkipToLastLoad
+s_mov_b32 s[sgprTemp3], 0
+PreFetchBegin:
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+GLOBAL_INC
+
+s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+
+s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask]
+s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0]
+s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+
+
+s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA]
+s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB]
+
+
+
+
+s_cmp_lt_i32 s[sgprTemp3], 2
+s_cbranch_scc1 SkipWait
+s_waitcnt vmcnt(8)
+s_barrier
+SkipWait:
+s_add_u32 s[sgprTemp3], s[sgprTemp3], 1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 PreFetchBegin
+// Drain: if more than one pass ran, first wait for all but the newest 4 loads
+// and hit a barrier; then wait for everything and hit two more barriers.
+// NOTE(review): these barriers presumably pair with the s_barrier instructions
+// in the compute waves' main loop - confirm the counts match.
+s_cmp_eq_i32 s[sgprTemp3], 1
+s_cbranch_scc1 Last1
+s_waitcnt vmcnt(4)
+s_barrier
+Last1:
+s_waitcnt vmcnt(0)
+s_barrier
+s_barrier
+
+// Final load: reset the LDS write addresses to their *ori values, load once
+// more, wait for completion, hit one barrier and end this wave.
+SkipToLastLoad:
+s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA]
+s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB]
+
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt vmcnt(0)
+s_barrier
+s_endpgm
+SkipGL:
+
+/******************************************/
+/* Generate SrcD ... */
+/******************************************/
+
+// D buffer descriptor.
+// Base += 2 * (WorkGroup0*128 + WorkGroup1*MT1*StridesD
+//              + WorkGroup2*StridesC[1])       (64-bit; the shift by 1 is x2)
+// Dword 2 = 2 * (StridesD*SizesFree[1] - WorkGroup1*MT1*StridesD)
+// NOTE(review): the WorkGroup2 term uses sgprStridesC+1 rather than a D
+// stride - confirm this is intended.
+s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower)
+s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128
+s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD]
+s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2]
+s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3]
+s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1]
+s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1]
+s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1
+s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD
+
+// Per-lane byte offset into D, with lane = vgprSerial & 63:
+//   strided term    = ((WaveID >> 2) * 32 + (lane >> 4)) * StridesD * 2
+//   contiguous term = (lane & 15) * 4 + (WaveID & 3) * 32 * 2
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0]
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x5
+v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1]
+v_and_b32 v[vgprTemp2], 15, v[vgprTemp0]
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2]
+v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD]
+v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1]
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2]
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0]
+
+/******************************************/
+/* Init ValueC ... */
+/******************************************/
+// Start the first scale/zero load, clear the 32 accumulators while it is in
+// flight, advance the scale/zero descriptors, and set the constants used by
+// the f32 -> upper-16-bit rounding (vgprMask1 = NaN result, vgprMask2 = bias).
+GLOBAL_LOAD_Scale_Zero
+v_mov_b32 v[vgprValuC+0], 0x0
+v_mov_b32 v[vgprValuC+1], 0x0
+v_mov_b32 v[vgprValuC+2], 0x0
+v_mov_b32 v[vgprValuC+3], 0x0
+v_mov_b32 v[vgprValuC+4], 0x0
+v_mov_b32 v[vgprValuC+5], 0x0
+v_mov_b32 v[vgprValuC+6], 0x0
+v_mov_b32 v[vgprValuC+7], 0x0
+v_mov_b32 v[vgprValuC+8], 0x0
+v_mov_b32 v[vgprValuC+9], 0x0
+v_mov_b32 v[vgprValuC+10], 0x0
+v_mov_b32 v[vgprValuC+11], 0x0
+v_mov_b32 v[vgprValuC+12], 0x0
+v_mov_b32 v[vgprValuC+13], 0x0
+v_mov_b32 v[vgprValuC+14], 0x0
+v_mov_b32 v[vgprValuC+15], 0x0
+v_mov_b32 v[vgprValuC+16], 0x0
+v_mov_b32 v[vgprValuC+17], 0x0
+v_mov_b32 v[vgprValuC+18], 0x0
+v_mov_b32 v[vgprValuC+19], 0x0
+v_mov_b32 v[vgprValuC+20], 0x0
+v_mov_b32 v[vgprValuC+21], 0x0
+v_mov_b32 v[vgprValuC+22], 0x0
+v_mov_b32 v[vgprValuC+23], 0x0
+v_mov_b32 v[vgprValuC+24], 0x0
+v_mov_b32 v[vgprValuC+25], 0x0
+v_mov_b32 v[vgprValuC+26], 0x0
+v_mov_b32 v[vgprValuC+27], 0x0
+v_mov_b32 v[vgprValuC+28], 0x0
+v_mov_b32 v[vgprValuC+29], 0x0
+v_mov_b32 v[vgprValuC+30], 0x0
+v_mov_b32 v[vgprValuC+31], 0x0
+
+GLOBAL_INC_Scale_Zero
+
+v_mov_b32 v[vgprMask1], 0x7fff0000
+v_mov_b32 v[vgprMask2], 0x7fff
+/******************************************/
+/* LoopCounter == 0, Skip to tail/last loop ...
*/
+/******************************************/
+// NOTE(review): when this branch is taken, the scale/zero unpack a few lines
+// below is skipped, yet TAIL_LOOP passes vgprValuScalesF32 / vgprValuZerosI32
+// to I4ToFp16. No other initialization of those registers is visible here -
+// confirm that sgprLoopCounterL <= 0 cannot happen or that they are set
+// elsewhere.
+s_cmp_le_i32 s[sgprLoopCounterL], 0
+s_cbranch_scc1 TAIL_LOOP
+s_waitcnt vmcnt(0)
+s_barrier
+
+// Prologue: issue the first X0 LDS reads, then decode the first scale/zero
+// pair. vgprValuZerosI32+i becomes -(zero_i * scale_i), the fma addend that
+// I4ToFp16 expects.
+LDS_LOADAB LDS_BLK_OFFSET*0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+
+/******************************************/
+/* Main Loop Process ... */
+/******************************************/
+
+// Two copies of the same 4-stage loop: waves 0-3 run MainLoopBeginW0_3, waves
+// with sgprWaveID >= 4 run MainLoopBeginW4_7. Each stage:
+//   - issues the X1 LDS reads, waits for the older X0 reads (lgkmcnt(8)),
+//     converts X0 with I4ToFp16 and runs MMAC_32x64_0
+//   - issues the next block's X0 reads, waits for the X1 reads, converts X1
+//     and runs MMAC_32x64_1
+//   - decrements sgprLoopCntCommon and leaves for TAIL_LOOP when it reaches 0
+// Stages use LDS blocks 0,1,2,3 in turn and wrap back to block 0.
+// Stages 2 and 4 additionally load the next scale/zero pair, wait for it with
+// vmcnt(0), advance the descriptors and re-derive the f32 scale / addend.
+// Every stage contains exactly one s_barrier; the two copies differ only in
+// where it sits (after MMAC_32x64_0 for waves 0-3, before it for waves 4-7).
+// NOTE(review): `.align32 8, 0xbf800001` is not a directive documented here;
+// 0xbf800001 is presumably the encoding of `s_nop 1` used as padding - confirm.
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc1 WaveID_gecase
+
+s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL]
+MainLoopBeginW0_3:
+
+// stage 1 (LDS block 0 -> 1)
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// stage 2 (LDS block 1 -> 2), refreshes scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// stage 3 (LDS block 2 -> 3)
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// stage 4 (LDS block 3 -> 0), refreshes scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 MainLoopBeginW0_3
+
+// Loop finished: jump over the waves 4-7 copy.
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+WaveID_gecase:
+
+
+s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL]
+MainLoopBeginW4_7:
+
+// stage 1 (LDS block 0 -> 1)
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[16]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLoopCntCommon]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+.endif
+
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// stage 2 (LDS block 1 -> 2), refreshes scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[16]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// stage 3 (LDS block 2 -> 3)
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+LDS_LOADAB LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[16]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// stage 4 (LDS block 3 -> 0), refreshes scale/zero
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+s_barrier
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+s_waitcnt vmcnt(0)
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(8)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[16]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 MainLoopBeginW4_7
+
+// Redundant with the fall-through: TAIL_LOOP is the next label.
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+/******************************************/
+/* Tail Loop Process ...
*/
+/******************************************/
+// Tail: one last X0 + X1 pass over LDS block 0 (the global-load waves write
+// their final tile at the *ori write address), this time waiting for all LDS
+// reads (lgkmcnt(0)) before each conversion.
+TAIL_LOOP:
+
+s_waitcnt vmcnt(0)
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(0)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_0
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(0)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x64_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[16]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+
+// Cross-wave exchange through LDS.
+// Each wave owns an 8192-byte slot: write address = WaveID*8192 + lane*4.
+// The read address is the slot 0x8000 bytes away (4 slots), i.e. wave w
+// exchanges with wave w+4 (waves 0-3) or w-4 (waves 4-7).
+v_and_b32 v[vgprTemp1], 63, v[vgprSerial]
+v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1]
+
+s_mul_i32 s[sgprTemp2], 8192, s[sgprWaveID]
+v_add_u32 v[vgprLocalWriteC], s[sgprTemp2], v[vgprLocalWriteC]
+
+s_barrier
+
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc1 Skip_Wave0_3
+v_add_u32 v[vgprLocalReadC], 0x8000, v[vgprLocalWriteC]
+Skip_Wave0_3:
+
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc0 Skip_Wave4_7
+s_mov_b32 s[sgprTemp2], 0x8000
+v_sub_u32 v[vgprLocalReadC], v[vgprLocalWriteC], s[sgprTemp2]
+Skip_Wave4_7:
+
+
+// Waves 0-3: publish v[16..31], then read the partner's 16 values into
+// vgprTmpValC+16..31 and accumulate them into vgprTmpValC+0..15.
+// Values are stored 256 bytes apart (64 lanes * 4 bytes) in the order
+// 16,20,17,21,18,22,19,23,24,28,25,29,26,30,27,31; the reads use the same
+// order and offsets.
+// NOTE(review): the sums only make sense if vgprTmpValC aliases v[0]
+// (the accumulators) - confirm the symbol definition.
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc1 Skip_Wave0_3_W
+ds_write_b32 v[vgprLocalWriteC], v[16], offset:0
+ds_write_b32 v[vgprLocalWriteC], v[20], offset:256
+ds_write_b32 v[vgprLocalWriteC], v[17], offset:512
+ds_write_b32 v[vgprLocalWriteC], v[21], offset:768
+ds_write_b32 v[vgprLocalWriteC], v[18], offset:1024
+ds_write_b32 v[vgprLocalWriteC], v[22], offset:1280
+ds_write_b32 v[vgprLocalWriteC], v[19], offset:1536
+ds_write_b32 v[vgprLocalWriteC], v[23], offset:1792
+
+ds_write_b32 v[vgprLocalWriteC], v[24], offset:2048
+ds_write_b32 v[vgprLocalWriteC], v[28], offset:2304
+ds_write_b32 v[vgprLocalWriteC], v[25], offset:2560
+ds_write_b32 v[vgprLocalWriteC], v[29], offset:2816
+ds_write_b32 v[vgprLocalWriteC], v[26], offset:3072
+ds_write_b32 v[vgprLocalWriteC], v[30], offset:3328
+ds_write_b32 v[vgprLocalWriteC], v[27], offset:3584
+ds_write_b32 v[vgprLocalWriteC], v[31], offset:3840
+
+// All writes done in this wave, then wait for the partner waves.
+s_waitcnt lgkmcnt(0)
+s_barrier
+ds_read_b32 v[vgprTmpValC+16], v[vgprLocalReadC+0] offset:0
+ds_read_b32 v[vgprTmpValC+20], v[vgprLocalReadC+0] offset:256
+ds_read_b32 v[vgprTmpValC+17], v[vgprLocalReadC+0] offset:512
+ds_read_b32 v[vgprTmpValC+21], v[vgprLocalReadC+0] offset:768
+ds_read_b32 v[vgprTmpValC+18], v[vgprLocalReadC+0] offset:1024
+ds_read_b32 v[vgprTmpValC+22], v[vgprLocalReadC+0] offset:1280
+ds_read_b32 v[vgprTmpValC+19], v[vgprLocalReadC+0] offset:1536
+ds_read_b32 v[vgprTmpValC+23], v[vgprLocalReadC+0] offset:1792
+ds_read_b32 v[vgprTmpValC+24], v[vgprLocalReadC+0] offset:2048
+ds_read_b32 v[vgprTmpValC+28], v[vgprLocalReadC+0] offset:2304
+ds_read_b32 v[vgprTmpValC+25], v[vgprLocalReadC+0] offset:2560
+ds_read_b32 v[vgprTmpValC+29], v[vgprLocalReadC+0] offset:2816
+ds_read_b32 v[vgprTmpValC+26], v[vgprLocalReadC+0] offset:3072
+ds_read_b32 v[vgprTmpValC+30], v[vgprLocalReadC+0] offset:3328
+ds_read_b32 v[vgprTmpValC+27], v[vgprLocalReadC+0] offset:3584
+ds_read_b32 v[vgprTmpValC+31], v[vgprLocalReadC+0] offset:3840
+
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+16]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+20]
+v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+17]
+v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+21]
+v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+18]
+v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+22]
+v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+19]
+v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+23]
+
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+24]
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+28]
+v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+25]
+v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+29]
+v_add_f32 v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+26]
+v_add_f32 v[vgprTmpValC+14], v[vgprTmpValC+14], v[vgprTmpValC+30]
+v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+27]
+v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+31]
+Skip_Wave0_3_W:
+
+
+
+// Waves 4-7: mirror image. Publish v[0..15], read the partner's 16 values
+// into vgprTmpValC+0..15, then add vgprTmpValC+16..31 to them. Both wave
+// groups execute exactly one s_barrier in this exchange.
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc0 Skip_Wave4_7_W
+
+ds_write_b32 v[vgprLocalWriteC], v[0], offset:0
+ds_write_b32 v[vgprLocalWriteC], v[4], offset:256
+ds_write_b32 v[vgprLocalWriteC], v[1], offset:512
+ds_write_b32 v[vgprLocalWriteC], v[5], offset:768
+ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024
+ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280
+ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536
+ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792
+
+ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048
+ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304
+ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560
+ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816
+ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072
+ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328
+ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584
+ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840
+
+s_waitcnt lgkmcnt(0)
+s_barrier
+
+
+ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0
+ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256
+ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+0] offset:512
+ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+0] offset:768
+ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+0] offset:1024
+ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+0] offset:1280
+ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+0] offset:1536
+ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+0] offset:1792
+ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:2048
+ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:2304
+ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+0] offset:2560
+ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+0] offset:2816
+ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+0] offset:3072
+ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+0] offset:3328
+ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+0] offset:3584
+ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+0] offset:3840
+
+s_waitcnt lgkmcnt(0)
+
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+16]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+20]
+v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+17]
+v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+21]
+v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+18]
+v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+22]
+v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+19]
+v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+23]
+
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+24]
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+28]
+v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+25]
+v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+29]
+v_add_f32 v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+26]
+v_add_f32 v[vgprTmpValC+14], v[vgprTmpValC+14], v[vgprTmpValC+30]
+v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+27]
+v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+31]
+
+/*
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprLocalWriteC]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadC]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+*/
+
+Skip_Wave4_7_W: + + + + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 
v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] 
//v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 
2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // 
Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store 
D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check 
Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] 
//v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], 
v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 
+buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +/* +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 
v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 
s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, 
v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], 
v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], 
v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + 
+s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 
v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 
v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask2] 
// Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+0], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+0], 16, v[vgprValuC+Nvoff+0] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +//v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +v_cmp_u_f32 s[sgprTemp0:sgprTemp1], v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] // check Nan +v_bfe_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], 16, 1 // Non-Nan case: store lsb of bf16 +v_add3_u32 v[vgprBFtemp], v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask2] // Non-Nan case: add lsb and the increment for rounding +v_cndmask_b32 v[vgprValuC+Nvoff+4], v[vgprBFtemp], v[vgprMask1], s[sgprTemp0:sgprTemp1] // +v_lshrrev_b32 v[vgprValuC+Nvoff+4], 16, v[vgprValuC+Nvoff+4] // convert C to bf16 +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +*/ +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..20a3bcab388fa52ed40e34e7ea0ec07906d7df3f --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,1868 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected 
Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + 
.size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + 
.address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT16x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 +.set vgprValuB_X0_I0, 244 +.set vgprValuB_X1_I0, 248 +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 
5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) - confirm AddressC is dead before ShadowLimitB is written +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 // re-assigned to 88 further down in this table +.set sgprLDSMask, 75 // re-assigned to 89 further down in this table +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same index as sgprTemp4 +.set sgprStructBit, 97 // same index as sgprTemp5 +.set sgprStructNum, 98 // same index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // same index as sgprSrdC (20) + + + +.macro MMAC_32x32_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 
v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.macro MMAC_32x32_1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.set MT0, 8 +.set MT1, 32 + +.set LDS_B_OFFSET, 512 +.set LDS_BLK_OFFSET, 2560 +.set LDS_BLK_OFFSET_64Kmasked, 2560 +.set LOG2BPE, 1 
+.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 32 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +//s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 3 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x10000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, 
v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 
+s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // 
WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute the debug-buffer address offset for each workgroup +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v1=wg1*nwg0+wg0 +v_lshlrev_b32 v5, 0x8, v4 // v1 = v1 * 256 // this is the total thread count; adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 
+v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_A, 2 //x4 load +.set LOG2_COALESCE_THREAD_A, 1 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 + +// mcc +s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesA] // notice +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? 
+s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... */ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +//v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +//v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +//v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +//v_lshrrev_b32 v[vgprTemp0], 5, v[vgprTemp0] +//v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +//v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] + +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp1] + +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // + + +s_lshr_b32 s[sgprTemp1], s[sgprSizesSum], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprTemp1], s[sgprTemp1], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprGlobalReadOffsetB+1] +v_add_u32 v[vgprGlobalReadOffsetB+2], 16, 
v[vgprGlobalReadOffsetB+0] + + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] + +v_add_u32 v[vgprGlobalReadOffsetB+2], 16, v[vgprGlobalReadOffsetB+0] +v_add_u32 v[vgprTemp0], 16, v[vgprTemp0] +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+3], 0x1, v[vgprGlobalReadOffsetB+3] + +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], 
s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? 
+s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + +/******************************************/ +/* Generate LDS A parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +//s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +//s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +//s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +//s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] + +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], 0x4000 +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] +// mcc +s_add_u32 s[sgprLDSMask], 0x4000, s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +v_mul_u32_u24 v2, 0x40, v1 +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 0x4000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x10, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x20, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x30, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x110, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x120, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x130, v[vgprLocalReadAddrA] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*32 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +//s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad + +s_mov_b32 s[sgprLocalWriteAddrB], 0x0 +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 // result unused: s[sgprTemp0] is overwritten by the s_mul_i32 below before it is read +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET + +s_mul_i32 s[sgprTemp0], s[sgprGlWaveID], 0x4000 // per-wave LDS stride of 0x4000 bytes +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] // write base = LDS_B_OFFSET plus GlWaveID*0x4000 +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep a copy of the initial write base + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane = serial mod 64 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 15 // lane mod 16 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] // lane / 16 +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] // (lane mod 16) * 64 +v_lshlrev_b32 v[vgprLocalReadAddrB], 3, v[vgprTemp2] // (lane / 16) * 8; previously written to v[sgprTemp2] (= v82), an SGPR index misused as a VGPR number inside the vgprValuA_X0_H0 range +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprLocalReadAddrB] // per-lane offset inside the B tile +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] // add the B region base +s_mul_i32 s[sgprTemp0], s[sgprGlWaveID], 0x4000 +v_add_u32 v[vgprLocalReadAddrB], s[sgprTemp0], v[vgprLocalReadAddrB] // add the per-wave LDS stride + + +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] + +/******************************************/ +/* Keep Sgpr Values for use later ... 
*/
+/******************************************/
+
+//store sgprs to keep value
+// The first three dwords of the A and B buffer descriptors are copied into
+// fixed lanes of vgprKeepSgprValue.
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2
+
+
+/******************************************/
+/* Generate Scale Zeros */
+/******************************************/
+// Per-lane byte offset into the scale buffer:
+//   (serial & 15) * 4 + WaveID * (SizesSum >> 7) * (StridesA * 4)
+v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15
+v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale]
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 //
+s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1]
+
+
+s_mul_i32 s[sgprTemp0], s[sgprWaveID], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale]
+
+// Per-lane byte offset into the zero-point buffer:
+//   (serial & 15) + WaveID * (SizesSum >> 7) * StridesA
+v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 //
+s_mov_b32 s[sgprTemp1], s[sgprStridesA]
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1]
+
+s_mul_i32 s[sgprTemp0], s[sgprWaveID], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero]
+
+// Tile start offset Temp0:Temp1 = WorkGroup0 * MT0 * 8.
+// NOTE(review): the second s_mul_hi_u32 takes the high half of (high word * 8)
+// and drops the carry from (low word * 8); this only yields the true 64-bit
+// product while WorkGroup0*MT0*8 fits in 32 bits -- confirm that bound.
+s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+s_mul_hi_u32 s[sgprTemp1], s[sgprTemp1], 8 // tlu=0, scaled tile-offset by stride
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride
+
+// Scale buffer limit candidate = ShadowLimitA >> 4; the compare sets SCC for the
+// s_cselect_b32 that follows.
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? 
+// Scale SRD: num_records = ShadowLimitA>>4 when that fits in 32 bits (SCC from the
+// preceding compare), else BufferLimit; base = ScaleAddress + tile offset
+// (Temp0:Temp1).
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] //
+
+s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD
+// Zero-point SRD: num_records from ShadowLimitA>>6, base = ZeroAddress +
+// (tile offset >> 2).
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp0:sgprTemp1], 2
+s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp2] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp3] // SRD base = Address+ tileStart1
+s_mov_b32 s[sgprZero+3], Srd127_96
+
+/******************************************/
+/* Define Global Load... */
+/******************************************/
+
+// GLOBAL_LOADAB offset
+//   Issues three buffer loads with the `lds` modifier: one dwordx2 for A and two
+//   dwordx4 for B. m0 is set before each load to LocalWriteAddrA+offset,
+//   LocalWriteAddrB+offset, and then that B value + WAVE_LDS_OFFSET_B.
+//   Clobbers m0 and SCC.
+.macro GLOBAL_LOADAB offset:req
+
+s_add_u32 m0, s[sgprLocalWriteAddrA], \offset
+buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrB], \offset
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+s_add_u32 m0, m0, WAVE_LDS_OFFSET_B
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2:vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+.endm
+
+// GLOBAL_LOAD_Scale_Zero
+//   Loads one byte of zero-point data into vgprValuZeros+0 and one dword of scale
+//   data into vgprValuScales+0 (two outstanding vmcnt operations).
+.macro GLOBAL_LOAD_Scale_Zero
+buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0
+buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0
+.endm
+
+// GLOBAL_INC_Scale_Zero
+//   Advances the scale SRD base by GlobalReadIncsA>>3 and the zero SRD base by
+//   GlobalReadIncsA>>5, then rewrites each num_records field as
+//   (ShadowLimitA >> 4 or 6) - increment, or BufferLimit when that exceeds 32 bits.
+//   Clobbers sgprTemp0..sgprTemp3 and SCC.
+//   NOTE(review): the limit is recomputed from sgprShadowLimitA on every call and
+//   this macro never updates sgprShadowLimitA itself; confirm the limit is meant
+//   to stay constant while the base advances.
+.macro GLOBAL_INC_Scale_Zero
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0]
+s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1]
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0]
+s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1]
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+.endm
+
+
+// I32ToF16 vgprIn vgprZero vgprScale
+//   In place: vgprIn = f16(f32(vgprIn - vgprZero)) * vgprScale (f16 multiply).
+.macro I32ToF16 vgprIn:req vgprZero:req vgprScale:req
+v_sub_i32 v[\vgprIn], v[\vgprIn], v[\vgprZero]
+v_cvt_f32_i32 v[\vgprIn], v[\vgprIn]
+v_cvt_f16_f32 v[\vgprIn], v[\vgprIn]
+v_mul_f16 v[\vgprIn], v[\vgprIn], v[\vgprScale]
+.endm
+
+// UnPackB32ToTwoF32 vgprScale vgprOut
+//   Treats vgprScale as two packed f16 values: the low 16 bits are converted to
+//   f32 in vgprOut+0 and the high 16 bits in vgprOut+1.
+.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req
+v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale]
+v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0]
+v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0]
+v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// UnPackB8To2F32 vgprZero vgprOut
+//   Splits vgprZero into nibbles: bits [3:0] -> f32 in vgprOut+0, and byte 0 of
+//   (vgprZero >> 4) -> f32 in vgprOut+1.
+.macro UnPackB8To2F32 vgprZero:req vgprOut:req
+v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero]
+v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf
+v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// I4ToFp16 vgprIn vgprZero vgprScale vgprPack
+//   Dequantizes four input registers (vgprIn+0..+3), each carrying two 4-bit
+//   values, into eight f16 values packed in vgprPack+0..+3.
+//   Per input i: low nibble -> scratch 2i, (in >> 4) -> scratch 2i+1; each pair is
+//   converted to f32 and run through v_pk_fma_f32 as value*scale + zero using the
+//   register pairs vgprScale:+1 and vgprZero:+1, then converted to f16.
+//   Pack order: +0 = {low(in0), low(in1)}, +1 = {low(in2), low(in3)},
+//               +2 = {high(in0), high(in1)}, +3 = {high(in2), high(in3)}.
+//   Scratch is always vgprValuA_X0_H0+0..+7, independent of the arguments.
+//   NOTE(review): assumes each input holds a zero-extended byte (bits above 7
+//   would leak into the high-nibble conversion) -- confirm against the loaders.
+.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn]
+v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1]
+v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2]
+v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3]
+v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf
+
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+
+v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2]
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7]
+.endm
+
+/******************************************/
+/* Define Global Load address Increase... */
+/******************************************/
+
+// GLOBAL_INC
+//   Advances the A descriptor base by GlobalReadIncsA+0, subtracts the same amount
+//   from the 64-bit ShadowLimitA, and refreshes SrdA+2 from it (BufferLimit when
+//   the remaining size exceeds 32 bits). The B descriptor base advances by
+//   GlobalReadIncsB+0; its limit update is commented out, so SrdB+2 is untouched.
+//   Clobbers sgprTemp0, sgprTemp1 and SCC.
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1]
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1]
+//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0]
+//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1]
+//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
+
+
+.endm
+
+/******************************************/
+/* Define LDS Load... 
*/
+/******************************************/
+
+// LDS_LOADAB off
+//   Issues six LDS reads for the "X0" register set: four single-byte reads of A
+//   through vgprLocalReadAddrA+0..+3 into vgprValuA_X0_I0+0..+3, and two 64-bit
+//   reads of B through vgprLocalReadAddrB at off+0 and off+1024 into
+//   vgprValuB_X0_I0+0..+3. No wait is issued here; callers use s_waitcnt lgkmcnt.
+.macro LDS_LOADAB off:req
+
+//ds_read_m32x16_b16 v[vgprValuA_X0_I0+ 0:vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+0] offset:\off
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+1024
+.endm
+
+// LDS_LOADAB1 off
+//   Same shape as LDS_LOADAB for the "X1" register set: A bytes come through
+//   vgprLocalReadAddrA+4..+7 into vgprValuA_X1_I0+0..+3, and B is read at off+32
+//   and off+1056 into vgprValuB_X1_I0+0..+3.
+.macro LDS_LOADAB1 off:req
+
+//ds_read_m32x16_b16 v[vgprValuA_X1_I0+ 0:vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+1056
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ... 
*/
+/******************************************/
+
+// Loop-count setup followed by the code path of the global-load waves.
+// Waves with WaveID < 4 branch to SkipGL; the remaining waves stream A/B from
+// global memory into LDS with GLOBAL_LOADAB and finish with s_endpgm.
+//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5
+// mcc
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 7
+
+// LoopCntCommon = (SizesSum >> 7) + (SizesSum & 31 != 0): s_and_b32 sets SCC when
+// its result is non-zero and s_add_u32 adds SCC. LoopCounterL = LoopCntCommon - 1.
+// NOTE(review): the s_min_u32 result is overwritten by the s_add_u32 two lines
+// later, and the remainder mask is 31 while the shift is 7 (128) -- confirm both.
+s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL]
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+s_cmp_lt_i32 s[sgprWaveID], 4
+s_cbranch_scc1 SkipGL
+
+
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprLDSMask]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+2]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+3]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+.endif
+
+
+
+// A single block goes straight to the final load.
+s_cmp_eq_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 SkipToLastLoad
+// sgprTemp3 counts prefetch iterations issued so far (GLOBAL_INC only uses
+// sgprTemp0/sgprTemp1, so the counter survives the macro).
+s_mov_b32 s[sgprTemp3], 0
+PreFetchBegin:
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+GLOBAL_INC
+
+// Advance the A write pointer by one block; if another block beyond the new
+// position would exceed sgprLDSMask, rewind to the saved base.
+s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked
+s_cmp_gt_u32 s[sgprTemp0], s[sgprLDSMask]
+s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+
+
+// Same wrap logic for the B write pointer.
+// NOTE(review): the bound is sgprLDSMask, which was derived from the A write
+// base, while the B pointer includes LDS_B_OFFSET -- confirm this comparison
+// wraps B at the intended point.
+s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked
+s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked
+s_cmp_gt_u32 s[sgprTemp0], s[sgprLDSMask]
+s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+
+// For the first four iterations no wait is issued. Afterwards at most 12 buffer
+// loads (four GLOBAL_LOADAB invocations of three loads each) stay in flight
+// before the barrier.
+s_cmp_lt_i32 s[sgprTemp3], 4
+s_cbranch_scc1 SkipWait
+s_waitcnt vmcnt(12)
+s_barrier
+SkipWait:
+s_add_u32 s[sgprTemp3], s[sgprTemp3], 1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 PreFetchBegin
+// Drain ladder: enter at the rung matching the number of iterations issued
+// (1, 2, 3, or more), then retire three loads and hit one barrier per rung.
+s_cmp_eq_i32 s[sgprTemp3], 1
+s_cbranch_scc1 Last1
+s_cmp_eq_i32 s[sgprTemp3], 2
+s_cbranch_scc1 Last2
+s_cmp_eq_i32 s[sgprTemp3], 3
+s_cbranch_scc1 Last3
+s_waitcnt vmcnt(9)
+s_barrier
+Last3:
+s_waitcnt vmcnt(6)
+s_barrier
+Last2:
+s_waitcnt vmcnt(3)
+s_barrier
+Last1:
+s_waitcnt vmcnt(0)
+s_barrier
+s_barrier
+
+// Final block: rewind both write pointers to their saved bases, load once more,
+// wait for it, hit one barrier and terminate this wave.
+SkipToLastLoad:
+s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt vmcnt(0)
+s_barrier
+s_endpgm
+SkipGL:
+
+/******************************************/
+/* Generate SrcD ... 
*/
+/******************************************/
+
+// Output descriptor SrdD and per-lane write offset.
+//   element offset = WorkGroup0*32 + WorkGroup1*MT1*StridesD
+//                    + StridesC+1 * WorkGroup2   (64-bit), shifted left by 1 and
+//   added to AddressD to form the base.
+//   num_records    = (StridesD+0 * SizesFree+1 - WorkGroup1*MT1*StridesD) << 1.
+// NOTE(review): the left shift by 1 presumably converts elements to bytes for a
+// 2-byte output type; the WorkGroup0 tile width is the literal 32 rather than a
+// symbol -- confirm both.
+s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower)
+s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 32
+s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD]
+s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2]
+s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3]
+// sgprTemp1 still holds WorkGroup1*MT1*StridesD from above.
+s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1]
+s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1]
+s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1
+s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD
+
+// Per-lane write offset:
+//   ((WaveID*8 + (lane >> 4)) * StridesD) * 2 + (lane & 15) * 4, lane = serial & 63.
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0]
+v_and_b32 v[vgprTemp2], 15, v[vgprTemp0]
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2]
+s_mul_i32 s[sgprTemp0], s[sgprWaveID], 8
+v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1]
+v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD]
+v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1]
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2]
+//s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32
+//s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1
+//v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0]
+
+/******************************************/
+/* Init ValueC ... 
*/
+/******************************************/
+// Prologue of the compute waves: start the first scale/zero fetch, clear the 16
+// accumulator registers vgprValuC+0..+15, advance the scale/zero descriptors,
+// unpack the fetched scale/zero pair, then either fall into the main loop
+// (after a barrier and the first LDS read) or branch to TAIL_LOOP when there is
+// no main-loop iteration to run.
+GLOBAL_LOAD_Scale_Zero
+v_mov_b32 v[vgprValuC+0], 0x0
+v_mov_b32 v[vgprValuC+1], 0x0
+v_mov_b32 v[vgprValuC+2], 0x0
+v_mov_b32 v[vgprValuC+3], 0x0
+v_mov_b32 v[vgprValuC+4], 0x0
+v_mov_b32 v[vgprValuC+5], 0x0
+v_mov_b32 v[vgprValuC+6], 0x0
+v_mov_b32 v[vgprValuC+7], 0x0
+v_mov_b32 v[vgprValuC+8], 0x0
+v_mov_b32 v[vgprValuC+9], 0x0
+v_mov_b32 v[vgprValuC+10], 0x0
+v_mov_b32 v[vgprValuC+11], 0x0
+v_mov_b32 v[vgprValuC+12], 0x0
+v_mov_b32 v[vgprValuC+13], 0x0
+v_mov_b32 v[vgprValuC+14], 0x0
+v_mov_b32 v[vgprValuC+15], 0x0
+
+GLOBAL_INC_Scale_Zero
+// Unpack scale/zero BEFORE the tail branch. Previously the unpack sat after the
+// branch, so the direct-to-tail path (LoopCounterL <= 0) ran I4ToFp16 with
+// vgprValuScalesF32 / vgprValuZerosI32 that this prologue had never written.
+// The zero registers end up holding -(zero * scale), the addend I4ToFp16 expects.
+// These are VALU-only instructions, so SCC and the barrier count are unaffected.
+s_waitcnt vmcnt(0)
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+/******************************************/
+/* LoopCounter == 0, Skip to tail/last loop ... */
+/******************************************/
+s_cmp_le_i32 s[sgprLoopCounterL], 0
+s_cbranch_scc1 TAIL_LOOP
+s_barrier
+
+// First LDS read for the main loop (X0 register set, ring slot 0).
+LDS_LOADAB LDS_BLK_OFFSET*0
+
+/******************************************/
+/* Main Loop Process ... 
*/
+/******************************************/
+
+// Main loop, unrolled six times over LDS ring slots LDS_BLK_OFFSET*0..*5.
+// Every step has the same shape:
+//   LDS_LOADAB1 slot k      -> issue X1 reads; lgkmcnt(6) then guarantees the six
+//                              X0 reads issued earlier have landed
+//   I4ToFp16 X0 -> X2, MMAC_32x32_0
+//   barrier, LDS_LOADAB slot k+1 (X0 reads for the next step)
+//   I4ToFp16 X1 -> X3, MMAC_32x32_1
+//   decrement sgprLoopCntCommon; exit to TAIL_LOOP when it reaches 0.
+// Every second step additionally fetches the next scale/zero pair
+// (GLOBAL_LOAD_Scale_Zero ... s_waitcnt vmcnt(0) ... GLOBAL_INC_Scale_Zero) and
+// unpacks it after the step's last I4ToFp16, so it takes effect from the next
+// step onward.
+// MMAC_32x32_0 / MMAC_32x32_1 are macros defined outside this section.
+// NOTE(review): this branch is presumably never taken, since the waves with
+// WaveID >= 4 terminate in the global-load path -- confirm before removing.
+s_cmp_ge_u32 s[sgprWaveID], 4
+s_cbranch_scc1 WaveID_gecase
+
+s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL]
+MainLoopBeginW0_3:
+
+// ---- step 1: slot 0 -> slot 1 ----
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+
+/*
+.if debug_buffer
+
+v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrB+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X0_I0+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+.endif
+*/
+
+
+
+
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+
+
+
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 2: slot 1 -> slot 2, refreshes scale/zero ----
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 3: slot 2 -> slot 3 ----
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 4: slot 3 -> slot 4, refreshes scale/zero ----
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*4
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 5: slot 4 -> slot 5 ----
+LDS_LOADAB1 LDS_BLK_OFFSET*4
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*5
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// ---- step 6: slot 5 -> slot 0 (ring wraps), refreshes scale/zero ----
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*5
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0]
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1]
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+// Back edge of the unrolled main loop: repeat while iterations remain.
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 MainLoopBeginW0_3
+
+
+// Falling through the back edge means the counter is <= 0, so this branch is
+// taken and the s_branch below it is not reached from the loop.
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+s_branch WaveID_EndSwitch
+WaveID_gecase:
+
+WaveID_EndSwitch:
+
+/******************************************/
+/* Tail Loop Process ... */
+/******************************************/
+// Final block: after all outstanding global loads and a barrier, both halves are
+// read from ring slot 0 and fully waited for (lgkmcnt(0)) before each
+// dequantize + MMAC pair.
+TAIL_LOOP:
+
+s_waitcnt vmcnt(0)
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(0)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(0)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+
+// Cross-wave reduction through LDS.
+//   vgprLocalReadC+k = lane*4 + k*0x4000 for k = 0..3 (lane = serial & 63).
+//   vgprLocalWriteC  = lane*4 + ((serial & 255) >> 6) * 0x4000.
+// Each wave w (0..3) writes twelve of its sixteen registers at fixed offsets
+// (register -> offset pairs are listed in the ds_write_b32 lines), skipping the
+// four that sit at offsets w*1024 + {0,256,512,768}. After a barrier it reads
+// those four offsets through the other three vgprLocalReadC addresses and adds
+// everything into vgprTmpValC+0, +4, +8, +12.
+// NOTE(review): the code mixes literal registers v[0]..v[15] with
+// vgprTmpValC+n; the v_mov_b32 shuffles only line up if vgprTmpValC (and the
+// accumulators) start at v0 -- confirm the symbol definitions.
+v_and_b32 v[vgprTemp1], 63, v[vgprSerial]
+v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1]
+v_mov_b32 v[vgprLocalReadC], v[vgprLocalWriteC]
+v_add_u32 v[vgprLocalReadC+1], 0x4000, v[vgprLocalReadC]
+v_add_u32 v[vgprLocalReadC+2], 0x8000, v[vgprLocalReadC]
+v_add_u32 v[vgprLocalReadC+3], 0xC000, v[vgprLocalReadC]
+
+v_and_b32 v[vgprTemp1], 255, v[vgprSerial]
+v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1]
+v_mul_u32_u24 v[vgprTemp1], 0x4000, v[vgprTemp1]
+v_add_u32 v[vgprLocalWriteC], v[vgprLocalWriteC], v[vgprTemp1]
+
+
+
+// All waves must be done reading A/B from LDS before it is reused.
+s_barrier
+
+// ---- wave 0: keeps offsets 0..768 local, reduces them ----
+s_cmp_eq_u32 s[sgprWaveID], 0
+s_cbranch_scc0 Skip_Wave0
+
+//ds_write_b32 v[vgprLocalWriteC], v[0], offset:0
+//ds_write_b32 v[vgprLocalWriteC], v[4], offset:256
+//ds_write_b32 v[vgprLocalWriteC], v[1], offset:512
+//ds_write_b32 v[vgprLocalWriteC], v[5], offset:768
+ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024
+ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280
+ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536
+ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792
+
+ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048
+ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304
+ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560
+ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816
+ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072
+ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328
+ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584
+ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840
+
+s_waitcnt lgkmcnt(0)
+s_barrier
+// Move the locally kept values into registers 8 and 12 (v8/v12 were already
+// written to LDS above).
+v_mov_b32 v[8], v[1]
+v_mov_b32 v[12], v[5]
+
+//ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0
+ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:0
+ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:0
+ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:0
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1]
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2]
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3]
+
+
+//ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256
+ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:256
+ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:256
+ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:256
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7]
+
+//ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:512
+ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:512
+ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:512
+ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:512
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9]
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10]
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11]
+
+//ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:768
+ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:768
+ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:768
+ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:768
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13]
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14]
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15]
+Skip_Wave0:
+
+
+// ---- wave 1: keeps offsets 1024..1792 local, reduces them ----
+s_cmp_eq_u32 s[sgprWaveID], 1
+s_cbranch_scc0 Skip_Wave1
+
+ds_write_b32 v[vgprLocalWriteC], v[0], offset:0
+ds_write_b32 v[vgprLocalWriteC], v[4], offset:256
+ds_write_b32 v[vgprLocalWriteC], v[1], offset:512
+ds_write_b32 v[vgprLocalWriteC], v[5], offset:768
+//ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024
+//ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280
+//ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536
+//ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792
+
+ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048
+ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304
+ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560
+ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816
+ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072
+ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328
+ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584
+ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840
+
+s_waitcnt lgkmcnt(0)
+s_barrier
+// Locally kept values go to registers 1, 5, 9, 13 (the slots whose LDS reads
+// are commented out below).
+v_mov_b32 v[1], v[2]
+v_mov_b32 v[5], v[6]
+v_mov_b32 v[9], v[3]
+v_mov_b32 v[13], v[7]
+
+ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:1024
+//ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:1024
+ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:1024
+ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:1024
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1]
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2]
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3]
+
+
+ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:1280
+//ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:1280
+ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:1280
+ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:1280
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7]
+
+ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:1536
+//ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:1536
+ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:1536
+ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:1536
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9]
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10]
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11]
+
+ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:1792
+//ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:1792
+ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:1792
+ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:1792
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13]
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14]
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15]
+Skip_Wave1:
+
+// ---- wave 2: keeps offsets 2048..2816 local, reduces them ----
+s_cmp_eq_u32 s[sgprWaveID], 2
+s_cbranch_scc0 Skip_Wave2
+
+ds_write_b32 v[vgprLocalWriteC], v[0], offset:0
+ds_write_b32 v[vgprLocalWriteC], v[4], offset:256
+ds_write_b32 v[vgprLocalWriteC], v[1], offset:512
+ds_write_b32 v[vgprLocalWriteC], v[5], offset:768
+ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024
+ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280
+ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536
+ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792
+
+//ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048
+//ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304
+//ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560
+//ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816
+ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072
+ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328
+ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584
+ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840
+
+s_waitcnt lgkmcnt(0)
+s_barrier
+// Locally kept values go to registers 2, 6, 10, 14.
+v_mov_b32 v[2], v[8]
+v_mov_b32 v[6], v[12]
+v_mov_b32 v[10], v[9]
+v_mov_b32 v[14], v[13]
+
+ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:2048
+ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:2048
+//ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:2048
+ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:2048
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1]
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2]
+v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3]
+
+
+ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:2304
+ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:2304
+//ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:2304
+ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:2304
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6]
+v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7]
+
+ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:2560
+ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:2560
+//ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:2560
+ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:2560
+s_waitcnt lgkmcnt(0)
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9]
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10]
+v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11]
+
+ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:2816
+ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:2816
+//ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:2816
+ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:2816 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave2: + +s_cmp_eq_u32 s[sgprWaveID], 3 +s_cbranch_scc0 Skip_Wave3 + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +//ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +//ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +//ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +//ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[3], v[10] +v_mov_b32 v[7], v[14] +v_mov_b32 v[11], v[11] +v_mov_b32 v[15], v[15] + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:3072 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:3072 +//ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:3072 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:3328 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:3328 
+//ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:3328 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:3584 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:3584 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:3584 +//ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:3584 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:3840 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:3840 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:3840 +//ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:3840 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave3: +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+4] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+8] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC+12] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], 
v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + + + + + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase // NOTE(review): the target label is the very next line, so this compare/branch does not select a different path here - confirm a beta != 0 path is intentionally absent +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 32 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 32*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +//s_mov_b32 s[sgprTemp1], s[sgprWaveID] +//s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +//v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store initial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store initial addr + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..38c5a42cda9d13fd76bc17939cb3de8959ec6b40 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_w4a16.s @@ -0,0 +1,2335 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_WG16_16_2 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 8 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + 
KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - 
.name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT256x32x32_SN_K1_PGR6_SB1_TT8_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 +.set vgprValuB_X0_I0, 244 +.set vgprValuB_X1_I0, 248 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 + +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 + + + +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF16, 180 + + +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set 
sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) - confirm the aliasing is intended +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 + + + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // NOTE(review): same index as sgprTensor2dSizeA (84) - confirm the two are never live together +.set sgprWaveID, 88 +.set sgprLDSMask, 89 +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // NOTE(review): 96..98 are also sgprTemp4..sgprTemp6 - confirm the aliasing is intended +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 + +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // NOTE(review): same index as sgprSrdC (20) - confirm the aliasing is intended + + + + +.macro MMAC_32x32_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+2*2+0:vgprValuA_X2_I0+2*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 
v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+3*2+0:vgprValuA_X2_I0+3*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+4*2+0:vgprValuA_X2_I0+4*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+5*2+0:vgprValuA_X2_I0+5*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+6*2+0:vgprValuA_X2_I0+6*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+7*2+0:vgprValuA_X2_I0+7*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X2_I0+2*2+0:vgprValuA_X2_I0+2*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] 
v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X2_I0+3*2+0:vgprValuA_X2_I0+3*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X2_I0+4*2+0:vgprValuA_X2_I0+4*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X2_I0+5*2+0:vgprValuA_X2_I0+5*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X2_I0+6*2+0:vgprValuA_X2_I0+6*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X2_I0+7*2+0:vgprValuA_X2_I0+7*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // +s_setprio 0 +.endm + +.macro MMAC_32x32_1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // 
+v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+2*2+0:vgprValuA_X3_I0+2*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+3*2+0:vgprValuA_X3_I0+3*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+4*2+0:vgprValuA_X3_I0+4*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+5*2+0:vgprValuA_X3_I0+5*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+6*2+0:vgprValuA_X3_I0+6*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+7*2+0:vgprValuA_X3_I0+7*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] 
v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X3_I0+2*2+0:vgprValuA_X3_I0+2*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X3_I0+3*2+0:vgprValuA_X3_I0+3*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X3_I0+4*2+0:vgprValuA_X3_I0+4*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X3_I0+5*2+0:vgprValuA_X3_I0+5*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X3_I0+6*2+0:vgprValuA_X3_I0+6*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X3_I0+7*2+0:vgprValuA_X3_I0+7*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // +s_setprio 0 +.endm + + +.set MT0, 128 +.set MT1, 32 +.set LDS_B_OFFSET, 8192 +.set LDS_BLK_OFFSET, 10240 +.set LDS_BLK_OFFSET_64Kmasked, 10240 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set 
MperWAVE, 32 +.set NperWAVE, 32 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] + + +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0xf000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, 
s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 
= s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, 
s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v1=wg1*nwg0+wg0 +v_lshlrev_b32 v5, 0x8, v4 // v1 = v1 * 256 // this is the total thread count; adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 
v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 16 //x4 load +.set LOG2_COALESCE_THREAD_A, 4 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 16, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] +s_mul_i32 s[sgprTemp0], s[sgprStridesA], 32 +v_add_u32 v[vgprGlobalReadOffsetA+1], v[vgprGlobalReadOffsetA+0], s[sgprTemp0] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 5, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 31, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 7, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 
+s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + +/******************************************/ +/* Generate LDS A parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +.set LDS_SUB_M_OFFSET, BPE*32 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 7, v[vgprTemp0] +v_lshlrev_b32 v0, 4, v0 +v_lshrrev_b32 v1, 2, v[vgprTemp0] + +v_and_b32 v4, LOADxWAVES_K_A-1, v1 +v_lshrrev_b32 v2, LOG2GLWAVES, v4 +v_and_b32 v3, GLWAVES-1, v4 +v_mul_u32_u24 v6, WAVE_LDS_OFFSET_A+0, v3 //lds WaveOffset + pad +v_mul_u32_u24 v2, MT0*BPE, v2 +v_add_u32 v4, v2, v6 +v_add_u32 v[vgprLocalReadAddrA], v4, v0 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] +v_and_b32 v3, v3, 3 +v_mul_u32_u24 v[vgprTemp0], 64, v3 +v_mov_b32 v[vgprTemp1], v6 +v_add_u32 v[vgprTemp1], WAVE_LDS_OFFSET_A+0, v[vgprTemp1] //lds A/B offset +v_mov_b32 v[vgprTemp2], WAVE_LDS_OFFSET_A + +v_add_u32 v[vgprLocalReadAddrA+0], 0, v[vgprLocalReadAddrA] + +//fixed input: v[vgprTemp0]->added address; +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm +ADDR_WRAP v[vgprLocalReadAddrA+0] +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*32 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // Temp0 = serial & 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 // Temp1 = Temp0 & 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] // Temp2 = Temp0 >> 4 +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] // Temp1 *= 64 +v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] // Temp2 <<= 3, in place: the destination must be a VGPR symbol, not v[sgprTemp2] +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 // Temp0 &= 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] // Temp0 >>= 2 +s_mov_b32 s[sgprTemp1], 0x210 // L1477 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] // Temp0 *= 0x210 +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] // Temp1 += Temp0 +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2] // LocalReadAddrB = Temp1 + Temp2 +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] // add the LDS_B_OFFSET base +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+1], 0x100, v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+2], 0x120, v[vgprLocalReadAddrB] + +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 9, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 
+v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + +/******************************************/ +/* Keep Sgpr Values for use later ... */ +/******************************************/ + +//store sgprs to keep value +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2 + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x3, v[vgprGlobalReadOffsetScale] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 256 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetZero], 0x1, v[vgprGlobalReadOffsetZero] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +// 64-bit scale by 8: shift the Temp0:Temp1 pair so bits leaving the low word land in the high word +s_lshl_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 3 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp0:sgprTemp1], 2 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp3] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 +/******************************************/ +/* Define Global Load... */ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds +s_add_u32 m0, m0, WAVE_LDS_OFFSET_A*4 +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_short_d16 v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_short_d16 v[vgprValuZeros+1], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:32 + +buffer_load_dwordx2 v[vgprValuScales+0:vgprValuScales+1], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +buffer_load_dwordx2 v[vgprValuScales+2:vgprValuScales+3], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen 
offset:128 +.endm + +.macro GLOBAL_INC_Scale_Zero + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1] +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1] +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +.endm + + +.macro I32ToF16 vgprIn:req vgprZero:req vgprScale:req +v_sub_i32 v[\vgprIn], v[\vgprIn], v[\vgprZero] +v_cvt_f32_i32 v[\vgprIn], v[\vgprIn] +v_cvt_f16_f32 v[\vgprIn], v[\vgprIn] +v_mul_f16 v[\vgprIn], v[\vgprIn], v[\vgprScale] +.endm + +.macro UnPackB32ToTwoF16 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+4], 16, v[\vgprScale+2] +v_lshrrev_b32 v[\vgprOut+4], 16, v[\vgprOut+4] +v_lshrrev_b32 v[\vgprOut+5], 16, v[\vgprScale+2] + +v_lshlrev_b32 v[\vgprOut+6], 16, v[\vgprScale+3] +v_lshrrev_b32 v[\vgprOut+6], 16, v[\vgprOut+6] +v_lshrrev_b32 v[\vgprOut+7], 16, v[\vgprScale+3] + +v_lshlrev_b32 v[\vgprOut+2], 16, v[\vgprScale+1] +v_lshrrev_b32 v[\vgprOut+2], 16, v[\vgprOut+2] +v_lshrrev_b32 v[\vgprOut+3], 16, v[\vgprScale+1] + +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale+0] +v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +.endm + +.macro UnPackB32To8B4 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+3], 12, v[\vgprZero] +v_lshrrev_b32 v[\vgprOut+2], 8, v[\vgprZero] +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_lshrrev_b32 v[\vgprOut+0], 0, v[\vgprZero] + +v_and_b32 v[\vgprOut+0], v[\vgprOut+0], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_and_b32 v[\vgprOut+2], v[\vgprOut+2], 0xf +v_and_b32 v[\vgprOut+3], v[\vgprOut+3], 0xf +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 28, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+6], 24, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 20, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 12, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+2], 8, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+0], 0, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0], 0xf +v_and_b32 
v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1], 0xf +v_and_b32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2], 0xf +v_and_b32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3], 0xf +v_and_b32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4], 0xf +v_and_b32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5], 0xf +v_and_b32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6], 0xf +v_and_b32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7], 0xf + +I32ToF16 vgprValuA_X0_H0+0 \vgprZero+0 \vgprScale+0 +I32ToF16 vgprValuA_X0_H0+1 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+2 \vgprZero+2 \vgprScale+2 +I32ToF16 vgprValuA_X0_H0+3 \vgprZero+3 \vgprScale+3 +I32ToF16 vgprValuA_X0_H0+4 \vgprZero+0 \vgprScale+0 +I32ToF16 vgprValuA_X0_H0+5 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+6 \vgprZero+2 \vgprScale+2 +I32ToF16 vgprValuA_X0_H0+7 \vgprZero+3 \vgprScale+3 + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+4] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+5] +v_pack_b32_f16 v[\vgprPack+4], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+6], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+7] + +v_lshrrev_b32 v[vgprValuA_X0_H0+15], 28, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+14], 24, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+13], 20, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+12], 16, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+11], 12, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+10], 8, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+9], 4, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+8], 0, v[\vgprIn+1] + +v_and_b32 v[vgprValuA_X0_H0+8], v[vgprValuA_X0_H0+8], 0xf +v_and_b32 v[vgprValuA_X0_H0+9], v[vgprValuA_X0_H0+9], 0xf +v_and_b32 v[vgprValuA_X0_H0+10], v[vgprValuA_X0_H0+10], 0xf +v_and_b32 v[vgprValuA_X0_H0+11], v[vgprValuA_X0_H0+11], 0xf +v_and_b32 v[vgprValuA_X0_H0+12], v[vgprValuA_X0_H0+12], 0xf +v_and_b32 v[vgprValuA_X0_H0+13], v[vgprValuA_X0_H0+13], 0xf +v_and_b32 v[vgprValuA_X0_H0+14], v[vgprValuA_X0_H0+14], 0xf 
+v_and_b32 v[vgprValuA_X0_H0+15], v[vgprValuA_X0_H0+15], 0xf + +I32ToF16 vgprValuA_X0_H0+8 \vgprZero+0 \vgprScale+0 +I32ToF16 vgprValuA_X0_H0+9 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+10 \vgprZero+2 \vgprScale+2 +I32ToF16 vgprValuA_X0_H0+11 \vgprZero+3 \vgprScale+3 +I32ToF16 vgprValuA_X0_H0+12 \vgprZero+0 \vgprScale+0 +I32ToF16 vgprValuA_X0_H0+13 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+14 \vgprZero+2 \vgprScale+2 +I32ToF16 vgprValuA_X0_H0+15 \vgprZero+3 \vgprScale+3 + +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+8], v[vgprValuA_X0_H0+12] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+9], v[vgprValuA_X0_H0+13] +v_pack_b32_f16 v[\vgprPack+5], v[vgprValuA_X0_H0+10], v[vgprValuA_X0_H0+14] +v_pack_b32_f16 v[\vgprPack+7], v[vgprValuA_X0_H0+11], v[vgprValuA_X0_H0+15] + +v_lshrrev_b32 v[vgprValuA_X0_H0+23], 28, v[\vgprIn+2] +v_lshrrev_b32 v[vgprValuA_X0_H0+22], 24, v[\vgprIn+2] +v_lshrrev_b32 v[vgprValuA_X0_H0+21], 20, v[\vgprIn+2] +v_lshrrev_b32 v[vgprValuA_X0_H0+20], 16, v[\vgprIn+2] +v_lshrrev_b32 v[vgprValuA_X0_H0+19], 12, v[\vgprIn+2] +v_lshrrev_b32 v[vgprValuA_X0_H0+18], 8, v[\vgprIn+2] +v_lshrrev_b32 v[vgprValuA_X0_H0+17], 4, v[\vgprIn+2] +v_lshrrev_b32 v[vgprValuA_X0_H0+16], 0, v[\vgprIn+2] + +v_and_b32 v[vgprValuA_X0_H0+16], v[vgprValuA_X0_H0+16], 0xf +v_and_b32 v[vgprValuA_X0_H0+17], v[vgprValuA_X0_H0+17], 0xf +v_and_b32 v[vgprValuA_X0_H0+18], v[vgprValuA_X0_H0+18], 0xf +v_and_b32 v[vgprValuA_X0_H0+19], v[vgprValuA_X0_H0+19], 0xf +v_and_b32 v[vgprValuA_X0_H0+20], v[vgprValuA_X0_H0+20], 0xf +v_and_b32 v[vgprValuA_X0_H0+21], v[vgprValuA_X0_H0+21], 0xf +v_and_b32 v[vgprValuA_X0_H0+22], v[vgprValuA_X0_H0+22], 0xf +v_and_b32 v[vgprValuA_X0_H0+23], v[vgprValuA_X0_H0+23], 0xf + +I32ToF16 vgprValuA_X0_H0+16 \vgprZero+4 \vgprScale+4 +I32ToF16 vgprValuA_X0_H0+17 \vgprZero+5 \vgprScale+5 +I32ToF16 vgprValuA_X0_H0+18 \vgprZero+6 \vgprScale+6 +I32ToF16 vgprValuA_X0_H0+19 \vgprZero+7 \vgprScale+7 +I32ToF16 vgprValuA_X0_H0+20 \vgprZero+4 
\vgprScale+4 +I32ToF16 vgprValuA_X0_H0+21 \vgprZero+5 \vgprScale+5 +I32ToF16 vgprValuA_X0_H0+22 \vgprZero+6 \vgprScale+6 +I32ToF16 vgprValuA_X0_H0+23 \vgprZero+7 \vgprScale+7 + +v_pack_b32_f16 v[\vgprPack+8], v[vgprValuA_X0_H0+16], v[vgprValuA_X0_H0+20] +v_pack_b32_f16 v[\vgprPack+10], v[vgprValuA_X0_H0+17], v[vgprValuA_X0_H0+21] +v_pack_b32_f16 v[\vgprPack+12], v[vgprValuA_X0_H0+18], v[vgprValuA_X0_H0+22] +v_pack_b32_f16 v[\vgprPack+14], v[vgprValuA_X0_H0+19], v[vgprValuA_X0_H0+23] + +v_lshrrev_b32 v[vgprValuA_X0_H0+31], 28, v[\vgprIn+3] +v_lshrrev_b32 v[vgprValuA_X0_H0+30], 24, v[\vgprIn+3] +v_lshrrev_b32 v[vgprValuA_X0_H0+29], 20, v[\vgprIn+3] +v_lshrrev_b32 v[vgprValuA_X0_H0+28], 16, v[\vgprIn+3] +v_lshrrev_b32 v[vgprValuA_X0_H0+27], 12, v[\vgprIn+3] +v_lshrrev_b32 v[vgprValuA_X0_H0+26], 8, v[\vgprIn+3] +v_lshrrev_b32 v[vgprValuA_X0_H0+25], 4, v[\vgprIn+3] +v_lshrrev_b32 v[vgprValuA_X0_H0+24], 0, v[\vgprIn+3] + +v_and_b32 v[vgprValuA_X0_H0+24], v[vgprValuA_X0_H0+24], 0xf +v_and_b32 v[vgprValuA_X0_H0+25], v[vgprValuA_X0_H0+25], 0xf +v_and_b32 v[vgprValuA_X0_H0+26], v[vgprValuA_X0_H0+26], 0xf +v_and_b32 v[vgprValuA_X0_H0+27], v[vgprValuA_X0_H0+27], 0xf +v_and_b32 v[vgprValuA_X0_H0+28], v[vgprValuA_X0_H0+28], 0xf +v_and_b32 v[vgprValuA_X0_H0+29], v[vgprValuA_X0_H0+29], 0xf +v_and_b32 v[vgprValuA_X0_H0+30], v[vgprValuA_X0_H0+30], 0xf +v_and_b32 v[vgprValuA_X0_H0+31], v[vgprValuA_X0_H0+31], 0xf + +I32ToF16 vgprValuA_X0_H0+24 \vgprZero+4 \vgprScale+4 +I32ToF16 vgprValuA_X0_H0+25 \vgprZero+5 \vgprScale+5 +I32ToF16 vgprValuA_X0_H0+26 \vgprZero+6 \vgprScale+6 +I32ToF16 vgprValuA_X0_H0+27 \vgprZero+7 \vgprScale+7 +I32ToF16 vgprValuA_X0_H0+28 \vgprZero+4 \vgprScale+4 +I32ToF16 vgprValuA_X0_H0+29 \vgprZero+5 \vgprScale+5 +I32ToF16 vgprValuA_X0_H0+30 \vgprZero+6 \vgprScale+6 +I32ToF16 vgprValuA_X0_H0+31 \vgprZero+7 \vgprScale+7 + +v_pack_b32_f16 v[\vgprPack+9], v[vgprValuA_X0_H0+24], v[vgprValuA_X0_H0+28] +v_pack_b32_f16 v[\vgprPack+11], v[vgprValuA_X0_H0+25], 
v[vgprValuA_X0_H0+29] +v_pack_b32_f16 v[\vgprPack+13], v[vgprValuA_X0_H0+26], v[vgprValuA_X0_H0+30] +v_pack_b32_f16 v[\vgprPack+15], v[vgprValuA_X0_H0+27], v[vgprValuA_X0_H0+31] + +.endm + +/******************************************/ +/* Define Global Load adress Increase... */ +/******************************************/ + +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +.endm + +/******************************************/ +/* Define LDS Load... */ +/******************************************/ + +.macro LDS_LOADAB off:req + +ds_read_m32x16_b16 v[vgprValuA_X0_I0+ 0:vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+0] offset:\off +ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0 +ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB+1] offset:\off+0 +.endm + +.macro LDS_LOADAB1 off:req + +ds_read_m32x16_b16 v[vgprValuA_X1_I0+ 0:vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+0] offset:\off+4096 +ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32 +ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB+2] offset:\off+0 + +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cbranch_scc1 SkipGL +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 + +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + +s_cmp_lt_i32 s[sgprTemp3], 4 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(12) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_cmp_eq_i32 s[sgprTemp3], 2 +s_cbranch_scc1 Last2 +s_cmp_eq_i32 s[sgprTemp3], 3 +s_cbranch_scc1 Last3 +s_waitcnt vmcnt(9) +s_barrier +Last3: +s_waitcnt vmcnt(6) +s_barrier +Last2: +s_waitcnt vmcnt(3) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 512 + +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x3, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 128 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + + +/******************************************/ +/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 +v_mov_b32 v[vgprValuC+16], 0x0 // initC +v_mov_b32 v[vgprValuC+17], 0x0 // initC +v_mov_b32 v[vgprValuC+18], 0x0 // initC +v_mov_b32 v[vgprValuC+19], 0x0 // initC +v_mov_b32 v[vgprValuC+20], 0x0 // initC +v_mov_b32 v[vgprValuC+21], 0x0 // initC +v_mov_b32 v[vgprValuC+22], 0x0 // initC +v_mov_b32 v[vgprValuC+23], 0x0 // initC +v_mov_b32 v[vgprValuC+24], 0x0 // initC +v_mov_b32 v[vgprValuC+25], 0x0 // initC +v_mov_b32 v[vgprValuC+26], 0x0 // initC +v_mov_b32 v[vgprValuC+27], 0x0 // initC +v_mov_b32 v[vgprValuC+28], 0x0 // initC +v_mov_b32 v[vgprValuC+29], 0x0 // initC +v_mov_b32 v[vgprValuC+30], 0x0 // initC +v_mov_b32 v[vgprValuC+31], 0x0 // initC +v_mov_b32 v[vgprValuC+32], 0x0 // initC +v_mov_b32 v[vgprValuC+33], 0x0 // initC +v_mov_b32 v[vgprValuC+34], 0x0 // initC +v_mov_b32 v[vgprValuC+35], 0x0 // initC +v_mov_b32 v[vgprValuC+36], 0x0 // initC +v_mov_b32 v[vgprValuC+37], 0x0 // initC +v_mov_b32 v[vgprValuC+38], 0x0 // initC +v_mov_b32 v[vgprValuC+39], 0x0 // initC +v_mov_b32 v[vgprValuC+40], 0x0 // initC +v_mov_b32 v[vgprValuC+41], 0x0 // initC +v_mov_b32 v[vgprValuC+42], 0x0 // initC +v_mov_b32 v[vgprValuC+43], 0x0 // initC +v_mov_b32 v[vgprValuC+44], 0x0 // initC +v_mov_b32 v[vgprValuC+45], 0x0 // initC +v_mov_b32 v[vgprValuC+46], 0x0 // initC +v_mov_b32 v[vgprValuC+47], 0x0 // initC +v_mov_b32 v[vgprValuC+48], 0x0 // initC +v_mov_b32 v[vgprValuC+49], 0x0 // initC +v_mov_b32 v[vgprValuC+50], 
0x0 // initC +v_mov_b32 v[vgprValuC+51], 0x0 // initC +v_mov_b32 v[vgprValuC+52], 0x0 // initC +v_mov_b32 v[vgprValuC+53], 0x0 // initC +v_mov_b32 v[vgprValuC+54], 0x0 // initC +v_mov_b32 v[vgprValuC+55], 0x0 // initC +v_mov_b32 v[vgprValuC+56], 0x0 // initC +v_mov_b32 v[vgprValuC+57], 0x0 // initC +v_mov_b32 v[vgprValuC+58], 0x0 // initC +v_mov_b32 v[vgprValuC+59], 0x0 // initC +v_mov_b32 v[vgprValuC+60], 0x0 // initC +v_mov_b32 v[vgprValuC+61], 0x0 // initC +v_mov_b32 v[vgprValuC+62], 0x0 // initC +v_mov_b32 v[vgprValuC+63], 0x0 // initC +GLOBAL_INC_Scale_Zero + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 + +UnPackB32ToTwoF16 vgprValuScales+0 vgprValuScalesF16+0 +UnPackB32To8B4 vgprValuZeros+0 vgprValuZerosI32+0 +UnPackB32To8B4 vgprValuZeros+1 vgprValuZerosI32+4 + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier + +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X3_I0 +UnPackB32ToTwoF16 vgprValuScales+0 vgprValuScalesF16+0 +UnPackB32To8B4 vgprValuZeros+0 vgprValuZerosI32+0 +UnPackB32To8B4 vgprValuZeros+1 vgprValuZerosI32+4 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(3) 
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X3_I0 +UnPackB32ToTwoF16 vgprValuScales+0 vgprValuScalesF16+0 +UnPackB32To8B4 vgprValuZeros+0 vgprValuZerosI32+0 +UnPackB32To8B4 vgprValuZeros+1 vgprValuZerosI32+4 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(3) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X3_I0 +UnPackB32ToTwoF16 vgprValuScales+0 vgprValuScalesF16+0 +UnPackB32To8B4 vgprValuZeros+0 vgprValuZerosI32+0 +UnPackB32To8B4 vgprValuZeros+1 vgprValuZerosI32+4 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP +s_branch WaveID_EndSwitch +WaveID_gecase: + 
+WaveID_EndSwitch: + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + + +//Tail-loop setup: reload the A/B buffer descriptors, advance their base addresses by Temp0 * GlobalReadIncsA/B, set LoopCounterL to the leftover K count, then run the edge fix-up. First step: reload SrdA/SrdB words 0-2 from lanes of vgprKeepSgprValue +v_readlane_b32 s[sgprSrdA+0], v[vgprKeepSgprValue], laneSrdA0 +v_readlane_b32 s[sgprSrdA+1], v[vgprKeepSgprValue], laneSrdA1 +v_readlane_b32 s[sgprSrdA+2], v[vgprKeepSgprValue], laneSrdA2 +v_readlane_b32 s[sgprSrdB+0], v[vgprKeepSgprValue], laneSrdB0 +v_readlane_b32 s[sgprSrdB+1], v[vgprKeepSgprValue], laneSrdB1 +v_readlane_b32 s[sgprSrdB+2], v[vgprKeepSgprValue], laneSrdB2 +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + +s_waitcnt vmcnt(0) +s_waitcnt lgkmcnt(0) +s_barrier +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] // reset the LDS write address for A from its saved "ori" copy +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] // same for B + +s_lshr_b32 s[sgprTemp0], s[sgprSizesSum], LOG2DEPTHU // Temp0 = SizesSum >> LOG2DEPTHU +s_and_b32 s[sgprTemp1], s[sgprSizesSum], DEPTHU-1 // SCC = 1 when the masked value is non-zero, i.e. a K tail exists +s_cselect_b32 s[sgprTemp1], 0, 1 //Temp1 = 0 when a tail exists (no -1 needed), otherwise 1 +s_sub_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] //subtract 1 only when there is no tail; Temp0 = number of increments to apply + +s_mul_i32 s[sgprTemp2], s[sgprTemp0], s[sgprGlobalReadIncsA] // low 32 bits of Temp0 * GlobalReadIncsA +s_mul_hi_i32 s[sgprTemp3], s[sgprTemp0], 0 // multiplier is the literal 0, so Temp3 is always 0 - NOTE(review): confirm the product never needs more than 32 bits +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp2] // advance the 64-bit base held in SrdA+0/SrdA+1 +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp3] +s_sub_u32 s[sgprSrdA+2], s[sgprSrdA+2], s[sgprTemp2] // reduce SrdA word 2 by the amount the base moved + +s_mul_i32 s[sgprTemp2], s[sgprTemp0], s[sgprGlobalReadIncsB] // low 32 bits of Temp0 * GlobalReadIncsB +s_mul_hi_i32 s[sgprTemp3], s[sgprTemp0], 0 // always 0, as for A above +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp2] // advance the 64-bit base held in SrdB+0/SrdB+1 +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp3] +//s_sub_u32 s[sgprSrdB+2], s[sgprSrdB+2], s[sgprTemp2] (disabled: unlike A, SrdB word 2 is left unchanged) + +s_and_b32 s[sgprLoopCounterL], s[sgprSizesSum], DEPTHU-1 // LoopCounterL = SizesSum & (DEPTHU-1) +s_cmp_eq_i32 s[sgprLoopCounterL], 0 +s_cmov_b32 s[sgprLoopCounterL], DEPTHU //if there is no leftover K, use a full DEPTHU instead + +s_mul_i32 s[sgprTemp3], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprTemp3], s[sgprSizesFree+0], s[sgprTemp3] // Temp3 = SizesFree0 - WorkGroup0*MT0 +s_cmp_ge_u32 s[sgprTemp3], MT0 +s_cbranch_scc1 NOT_EDGE_A // at least MT0 elements remain: skip the A edge fix-up + +s_lshl_b32 s[sgprTemp3], s[sgprTemp3], LOG2BPE +s_lshr_b32 s[sgprTemp3], 
s[sgprTemp3], 2 +s_lshl_b32 s[sgprTemp3], s[sgprTemp3], 2 //round the byte offset down to a multiple of 4 (the original note called this the 16-byte aligned offset; only the B path shifts by 4) + + +//fp16 case: only 2 bytes can be missing; when m>=1 only the last K column can fail to be read +//when m=1 the last K column fails to be read +//fp16: each element is 2 bytes, so only the last column can be missed; therefore only the last K column needs to be refreshed + +s_mov_b32 s[sgprTemp0], 1 +s_cmp_eq_i32 s[sgprSizesFree+0], 1 +s_cmov_b32 s[sgprTemp0], 1 //this condition is only needed for fp16 - NOTE(review): both the mov and the cmov load 1, so Temp0 is 1 either way + +s_sub_u32 s[sgprTemp0], s[sgprLoopCounterL], s[sgprTemp0] //move to last K +s_mul_i32 s[sgprTemp1], s[sgprTemp0], s[sgprStridesA] +s_lshl_b32 s[sgprTemp1], s[sgprTemp1], LOG2BPE //multiply by the number of bytes per element +v_and_b32 v[vgprTemp0], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprTemp0], LOG2BPE, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], s[sgprTemp1] //global offset of the last K column + +s_mul_i32 s[sgprTemp2], s[sgprTemp0], MT0 +s_lshl_b32 s[sgprTemp2], s[sgprTemp2], LOG2BPE //multiply by the number of bytes per element +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprTemp2], LOG2BPE, v[vgprTemp2] +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp2] //LDS offset of the last K column + +v_add_u32 v[vgprTemp0], v[vgprTemp0], s[sgprTemp3] +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp3] //add the N-direction offset to both the global and the LDS offset + +v_and_b32 v[vgprTemp3], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp3], 4, v[vgprTemp3] +v_lshlrev_b32 v[vgprTemp3], LOG2BPE, v[vgprTemp3] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp3], s[sgprStridesA] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] //read the following 4 K columns; bounds are enforced by the buffer limit +v_mul_u32_u24 v[vgprTemp1], MT0, v[vgprTemp3] +v_add_u32 v[vgprTemp2], v[vgprTemp2], v[vgprTemp1] //write the following 4 K columns; the mask keeps the write in bounds + + +buffer_load_ushort v[vgprTemp1], v[vgprTemp0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 +v_cmp_lt_u32 s[sgprTemp0:sgprTemp1], v[vgprTemp0], s[sgprSrdA+2] //lanes whose offset is below SrdA word 2 are in range; the result is the mask for the LDS write +s_waitcnt vmcnt(0) +s_mov_b64 exec, s[sgprTemp0:sgprTemp1] //writing too much would overwrite later data, in particular A overwriting B's data +ds_write_b16 v[vgprTemp2], v[vgprTemp1] +s_mov_b64 exec, 0xffffffffffffffff // re-enable all 64 lanes +s_waitcnt lgkmcnt(0) +NOT_EDGE_A: + + +s_mul_i32 s[sgprTemp3], s[sgprWorkGroup1], MT1 +s_sub_u32 s[sgprTemp3], s[sgprSizesFree+1], s[sgprTemp3] // Temp3 = SizesFree1 - WorkGroup1*MT1 +//s_cmp_ge_u32 s[sgprTemp3], MT1 (disabled) NOTE(review): the live compare that follows is against 0, which an unsigned >= always satisfies, so the branch to NOT_EDGE_B is always taken and the B edge fix-up below never executes +s_cmp_ge_u32 
s[sgprTemp3], 0 +s_cbranch_scc1 NOT_EDGE_B + +s_lshl_b32 s[sgprTemp3], s[sgprTemp3], LOG2BPE +s_lshr_b32 s[sgprTemp3], s[sgprTemp3], 4 +s_lshl_b32 s[sgprTemp3], s[sgprTemp3], 4 //round the byte offset down to a multiple of 16 + +//fp16 case: only 2 bytes can be missing; when m>=1 only the last K column can fail to be read +//when m=1 the last K column fails to be read +//fp16: each element is 2 bytes, so only the last column can be missed; therefore only the last K column needs to be refreshed + +s_mov_b32 s[sgprTemp0], 1 +s_cmp_eq_i32 s[sgprSizesFree+1], 1 +s_cmov_b32 s[sgprTemp0], 1 //this condition is only needed for fp16 - NOTE(review): both the mov and the cmov load 1, so Temp0 is 1 either way + +s_sub_u32 s[sgprTemp0], s[sgprLoopCounterL], s[sgprTemp0] //move to last K +s_mul_i32 s[sgprTemp1], s[sgprTemp0], s[sgprStridesB] +s_lshl_b32 s[sgprTemp1], s[sgprTemp1], LOG2BPE //multiply by the number of bytes per element +v_and_b32 v[vgprTemp0], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprTemp0], LOG2BPE, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], s[sgprTemp1] //global offset of the last K column + +s_mul_i32 s[sgprTemp2], s[sgprTemp0], MT1 +s_lshl_b32 s[sgprTemp2], s[sgprTemp2], LOG2BPE //multiply by the number of bytes per element +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprTemp2], LOG2BPE, v[vgprTemp2] +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp2] +v_add_u32 v[vgprTemp2], LDS_B_OFFSET, v[vgprTemp2] //LDS offset of the last K column, placed after LDS_B_OFFSET + +v_add_u32 v[vgprTemp0], v[vgprTemp0], s[sgprTemp3] +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp3] //add the N-direction offset to both the global and the LDS offset + +v_and_b32 v[vgprTemp3], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp3], 4, v[vgprTemp3] +v_lshlrev_b32 v[vgprTemp3], LOG2BPE, v[vgprTemp3] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp3], s[sgprStridesB] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] //read the following 4 K columns; bounds are enforced by the buffer limit +v_mul_u32_u24 v[vgprTemp1], MT1, v[vgprTemp3] +v_add_u32 v[vgprTemp2], v[vgprTemp2], v[vgprTemp1] //write the following 4 K columns; the mask keeps the write in bounds + +buffer_load_ushort v[vgprTemp1], v[vgprTemp0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 +v_cmp_lt_u32 s[sgprTemp0:sgprTemp1], v[vgprTemp0], s[sgprSrdB+2] //lanes whose offset is below SrdB word 2 are in range; the result is the mask for the LDS write +s_waitcnt vmcnt(0) +s_mov_b64 exec, s[sgprTemp0:sgprTemp1] //writing too much would overwrite later data, in particular A overwriting B's data +ds_write_b16 v[vgprTemp2], v[vgprTemp1] +s_mov_b64 exec, 0xffffffffffffffff // re-enable all 64 lanes +s_waitcnt lgkmcnt(0) + 
+NOT_EDGE_B: + +//GLOBAL_LOAD_Scale_Zero +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 + +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) + +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF16+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 512 + +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 512*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_mov_b32 s[sgprTemp1], s[sgprWaveID] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 128 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] 
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] 
+v_mul_f32 v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], 
s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], 
s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] 
+v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], 
s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] 
+v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], 
s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 32 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], 
s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 33 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], 
s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 34 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] 
+v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], 
s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 35 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] 
+v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+8], s[sgprAlpha], v[vgprValuC+Nvoff+8] +v_cvt_f16_f32 v[vgprValuC+Nvoff+8], v[vgprValuC+Nvoff+8] +buffer_store_short v[vgprValuC+Nvoff+8], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+12], s[sgprAlpha], v[vgprValuC+Nvoff+12] +v_cvt_f16_f32 v[vgprValuC+Nvoff+12], v[vgprValuC+Nvoff+12] +buffer_store_short v[vgprValuC+Nvoff+12], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 122, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 122, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+16], s[sgprAlpha], v[vgprValuC+Nvoff+16] +v_cvt_f16_f32 v[vgprValuC+Nvoff+16], v[vgprValuC+Nvoff+16] +buffer_store_short v[vgprValuC+Nvoff+16], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+20], 
s[sgprAlpha], v[vgprValuC+Nvoff+20] +v_cvt_f16_f32 v[vgprValuC+Nvoff+20], v[vgprValuC+Nvoff+20] +buffer_store_short v[vgprValuC+Nvoff+20], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+24], s[sgprAlpha], v[vgprValuC+Nvoff+24] +v_cvt_f16_f32 v[vgprValuC+Nvoff+24], v[vgprValuC+Nvoff+24] +buffer_store_short v[vgprValuC+Nvoff+24], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] + +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+28], s[sgprAlpha], v[vgprValuC+Nvoff+28] +v_cvt_f16_f32 v[vgprValuC+Nvoff+28], v[vgprValuC+Nvoff+28] +buffer_store_short v[vgprValuC+Nvoff+28], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..0dfc7ca3fd25df5873a3ed4d5bdb7f4ee8b072a9 --- /dev/null +++ 
b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,1708 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - 
.name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + 
.value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT32x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3: +label_ASM_Start: /// Main body of the asm kernel +// VGPR index symbols: each name is a register number, used below as v[name] or v[name:name+k] +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 +.set vgprValuB_X0_I0, 244 +.set vgprValuB_X1_I0, 248 +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 // same index as vgprGlobalReadOffsetScale; NOTE(review): confirm the live ranges do not overlap +.set vgprLocalWriteB, 198 // same index as vgprGlobalReadOffsetZero + +.set vgprLocalWriteC, 196 // same index as vgprLocalWriteA +.set vgprLocalReadC, 230 // same index as vgprGLA +.set vgprTmpValC, 0 // same index as vgprValuC + +.set vgprAddressDbg, 200 //debugbuffer address, used as the 64-bit pair v[vgprAddressDbg:vgprAddressDbg+1] +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 // same index as vgprLocalReadAddrB +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 
0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 // NOTE(review): A0/A1 are 0/1 and B0/B1/B2 are 32/33/34, so 3 breaks the pattern; confirm 3 (not 2) is intended +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + +// SGPR index symbols: each name is a register number, used below as s[name] or s[name:name+k] +//Args define +.set sgprKernArgAddress, 0 // s[0:1]: base address of the s_load kernarg reads +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // same index as sgprAddressC; NOTE(review): confirm AddressC is dead before this is written +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 // set again to 88 further down in this table +.set sgprLDSMask, 75 // set again to 89 further down in this table +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 // second assignment; the earlier one in this table was 64 +.set sgprLDSMask, 89 // second assignment; the earlier one in this table was 75 + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same index as sgprTemp4 +.set sgprStructBit, 97 // same index as sgprTemp5 +.set sgprStructNum, 98 // same index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // same index as sgprSrdC + + +// MMAC_32x32_0: four v_mmac_f32_16x16x16_f16 issues that accumulate into ValuC, reading ValuA_X2_I0 and ValuB_X0_I0 +.macro MMAC_32x32_0 +v_mmac_f32_16x16x16_f16 
v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // accumulator ValuC regs 0-3; A_X2 regs 0-1; B_X0 regs 0-1 +s_setprio 1 // wave priority 1 for the remaining issues +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // accumulator ValuC regs 4-7; A_X2 regs 2-3; B_X0 regs 0-1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // accumulator ValuC regs 8-11; A_X2 regs 0-1; B_X0 regs 2-3 +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // accumulator ValuC regs 12-15; A_X2 regs 2-3; B_X0 regs 2-3 +s_setprio 0 // back to wave priority 0 +.endm +// MMAC_32x32_1: same four issues and accumulators as MMAC_32x32_0, reading ValuA_X3_I0 and ValuB_X1_I0 instead +.macro MMAC_32x32_1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // accumulator ValuC regs 0-3; A_X3 regs 0-1; B_X1 regs 0-1 +s_setprio 1 // wave priority 1 for the remaining issues +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // accumulator ValuC regs 4-7; A_X3 regs 2-3; B_X1 regs 0-1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // accumulator ValuC regs 8-11; A_X3 regs 0-1; B_X1 regs 2-3 +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] 
v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // accumulator ValuC regs 12-15; A_X3 regs 2-3; B_X1 regs 2-3 +s_setprio 0 // back to wave priority 0 +.endm +// Tile and layout constants used by the rest of the kernel +.set MT0, 16 +.set MT1, 32 + +.set LDS_B_OFFSET, 1024 +.set LDS_BLK_OFFSET, 3072 +.set LDS_BLK_OFFSET_64Kmasked, 3072 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 32 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // fills sgprSizesFree (24) through sgprStridesC+1 (39) +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // fills sgprStridesA (40:41) and sgprStridesB (42:43) +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // fills sgprAlpha (44) and sgprBeta (45) +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 // kernarg byte 112 after the 16-byte shift; NOTE(review): the metadata names this slot dstD, confirm the host passes the zero-point pointer here +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 // kernarg byte 120 after the 16-byte shift; NOTE(review): the metadata names this slot Synchronizer, confirm the host passes the scale pointer here +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] // keep the incoming workgroup id before it is remapped +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 // SizesFree0 = SizesFree0 / 2 +//s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 3 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 // StridesA0 = StridesA0 / 2 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 // WaveID = first active lane's v0 / 64 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 // GlWaveID = WaveID modulo GLWAVES (4) 
+s_mov_b32 s[sgprLDSMask], 0x10000 + +// NumWorkGroups0 = ceil(SizesFree0 / MT0) and NumWorkGroups1 = ceil(SizesFree1 / MT1), computed with a float reciprocal +v_mov_b32 v8, MT0 // MT0 (the divisor) into v8 +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v9 = v7 minus quotient * v8 (the remainder) +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil: quotient plus 1 where the remainder is non-zero +v_mov_b32 v8, MT1 // MT1 (the divisor) into v8 +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v9 = v7 minus quotient * v8 (the remainder) +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil: quotient plus 1 where the remainder is non-zero +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 // divisor = numWG0 * numWG1 * GSU +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = remainder of the float-estimated quotient +v_cmpx_eq_u32 exec, v7, s78 // keep only lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // in those lanes the estimated quotient was one too small: add 1 +s_mov_b64 exec, -1 // s78 
= s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 // also times GSU (s79), the same factor used in the wg2 divisor +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = remainder of the float-estimated quotient +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // keep only lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // in those lanes the estimated quotient was one too small: add 1 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM // WGM <= 1: skip the WGM remap and jump to label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // 
WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] // s78 = remainder for the last (partial) block, otherwise WGM +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder (overwritten by the s_mul_i32 that follows) +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 // 0: the ".if debug_buffer" dump blocks that follow are not assembled +.if debug_buffer // compute each workgroup's 
address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved before the remap +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256 // this is the total thread count; adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 // v6 = v5 * 2 +v_add_u32 v7, v6, v[vgprSerial] // v7 = tid plus 512 * workgroup id = serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = serial * 64, the offset added to the debug buffer address +v_mov_b32 v2, 0 // zero operand for the carry add +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high dword of the debug buffer address +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... 
*/
+/******************************************/
+// Builds the per-lane global read offset for A and the A buffer descriptor (s[sgprSrdA..+3]).
+.set COALESCE_THREAD_A, 2 //x4 load
+.set LOG2_COALESCE_THREAD_A, 1 //x4 load
+
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] // v1 = lane / COALESCE_THREAD_A
+v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] // v0 = lane % COALESCE_THREAD_A
+v_mul_lo_u32 v0, 16, v0 // v0 *= 16 (bytes of one dwordx4 load)
+s_mul_i32 s[sgprTemp0], s[sgprStridesA], BPE // StridesA * BPE
+v_mul_lo_u32 v1, v1, s[sgprTemp0] // v1 *= StridesA * BPE
+v_add_u32 v[vgprGlobalReadOffsetA], v0, v1
+
+// mcc
+s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // SizesSum / 2
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesA] // notice
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] // scaled by this wave's GlWaveID
+v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1]
+
+s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] // 64-bit StridesA * SizesSum, low dword
+s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] // high dword
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // tile start = WorkGroup0 * MT0
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] // limit = tensor size - tile start
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0
+// Set limit to use bytes fp16 = 0
+s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG
+s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // add the tile start
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // multiply the 64-bit offset by 2
+s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] // SRD base = AddressA + offset
+s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3]
+s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD
+
+s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ?
+s_cbranch_scc1 label_SkipMmac
+
+
+s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // low bit of SizesFree+0
+s_cmp_eq_u32 s[sgprTemp1], 0 // even size: no pre-pad adjustment
+s_cbranch_scc1 label_skiPadA
+s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad
+s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+label_skiPadA:
+
+s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB
+
+/******************************************/
+/* Generate Global B parameters ... */
+/******************************************/
+// Builds the per-lane index/offset pairs v[vgprGlobalReadOffsetB+0..3] and the B buffer descriptor.
+// The B loads in GLOBAL_LOADAB use idxen+offen, so each pair is (index, byte offset).
+.set COALESCE_THREAD_B, 8 //x4 load
+.set LOG2_COALESCE_THREAD_B, 3 //x4 load
+
+v_and_b32 v[vgprTemp1], 63, v[vgprSerial] // lane id within the wave
+v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp1] // vgprTemp0 = lane / 4
+
+v_and_b32 v[vgprTemp1], 3, v[vgprSerial] // vgprTemp1 = lane % 4
+v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] // vgprTemp1 *= 8
+s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd // shift amount = StridesB >> 13
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624
+s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] // SizesFree+1 - WorkGroup1*MT1
+s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // last valid index relative to this tile
+v_mov_b32 v[vgprTemp3], s[sgprTemp0] // broadcast that bound to a VGPR
+v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] // clamp lane/4 to the bound
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // index0 = clamped value << (StridesB >> 13)
+
+
+s_lshr_b32 s[sgprTemp1], s[sgprSizesSum], 2 // SizesSum / 4
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] // scaled by this wave's GlWaveID
+v_add_u32 v[vgprTemp1], s[sgprTemp1], v[vgprTemp1]
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element
+v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprGlobalReadOffsetB+1] // second load uses the same byte offset
+v_add_u32 v[vgprGlobalReadOffsetB+2], 16, v[vgprGlobalReadOffsetB+0] // second load index = index0 + 16
+
+
+// Select StructNum/StructBit from SizesSum: each later compare overrides the earlier one,
+// so the largest threshold that SizesSum exceeds wins. Neither is written when SizesSum <= 512.
+s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd
+s_cmp_gt_u32 s[sgprSizesSum+0], 512
+s_cmov_b32 s[sgprStructNum], 512
+s_cmov_b32 s[sgprStructBit], 26
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 1024
+s_cmov_b32 s[sgprStructNum], 1024
+s_cmov_b32 s[sgprStructBit], 27
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 2048
+s_cmov_b32 s[sgprStructNum], 2048
+s_cmov_b32 s[sgprStructBit], 28
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 4096
+s_cmov_b32 s[sgprStructNum], 4096
+s_cmov_b32 s[sgprStructBit], 29
+
+// sgprTemp7 = 1 when SizesSum is exactly 512, 1024, 2048 or 4096, else 0
+s_mov_b32 s[sgprTemp7], 0
+s_cmp_eq_u32 s[sgprSizesSum+0], 4096
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 2048
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 1024
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 512
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cbranch_scc1 Skip_K4096
+s_cmp_le_u32 s[sgprSizesSum+0], 512
+s_cbranch_scc1 Skip_K_Sub2048
+s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] // only for SizesSum > 512 and not one of the exact sizes
+Skip_K_Sub2048:
+Skip_K4096:
+
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cbranch_scc1 Skip2_K4096
+s_cmp_le_u32 s[sgprSizesSum+0], 512
+s_cbranch_scc1 Skip2_K_Sub2048
+
+// Same condition as above: recompute the index/offset pairs with the StrideStruct correction folded into the byte offset
+v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0]
+v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0]
+v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2]
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1]
+
+v_add_u32 v[vgprGlobalReadOffsetB+2], 16, v[vgprGlobalReadOffsetB+0]
+v_add_u32 v[vgprTemp0], 16, v[vgprTemp0]
+v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0]
+v_add_u32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1], v[vgprTemp2]
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+3], 0x1, v[vgprGlobalReadOffsetB+3]
+
+Skip2_K_Sub2048:
+Skip2_K4096:
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cmov_b32 s[sgprStrideStruct], 0 // exact sizes need no correction
+
+s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] // 64-bit SizesFree+1 * StridesB, low dword
+s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] // high dword
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] // tile start = WorkGroup1 * MT1 * StridesB
+
+s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0
+// Set limit to use bytes fp16 = 0
+s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE
+s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG
+s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // multiply the 64-bit offset by 2
+s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // SRD base high dword plus carry
+s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD
+
+s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ?
+s_cbranch_scc1 label_SkipMmac
+
+// Build the value OR-ed into SrdB+1 below.
+// NOTE(review): bits 16+ and 0x40000000 look like the descriptor stride / swizzle fields - confirm against the ISA.
+s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // StridesB >> 13
+s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // StridesB * 2
+s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // (StridesB * 2) >> (StridesB >> 13)
+s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // move into bits 16 and up
+s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // set bit 30
+
+s_cmp_eq_u32 s[sgprTemp7], 1 // exact size: keep the value above
+s_cbranch_scc1 Skip1_K4096 //
+s_cmp_le_u32 s[sgprSizesSum+0], 512 // small size: keep the value above
+s_cbranch_scc1 Skip1_K_Sub2048 //
+s_mov_b32 s[sgprStructNum], 1 //
+s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // 1 << StructBit
+s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // set bit 30
+s_mov_b32 s[sgprTemp1], s[sgprStructNum] // replaces the value built above
+Skip1_K_Sub2048: //
+Skip1_K4096:
+
+s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer,
+
+//Struct index Limit
+s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] // SizesFree+1 modulo MT1
+s_cmp_eq_u32 s[sgprTemp0], 0
+s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] // a zero remainder means a full tile
+s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1
+s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] // is this the last workgroup along dimension 1?
+s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 // last workgroup uses the remainder, others use MT1
+//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1
// mcc test edge
+s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] // overwrite the SrdB limit dword with the value held in sgprTemp0
+
+s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE // per-iteration advance of the B SRD base
+
+/******************************************/
+/* Generate LDS A parameters ... */
+/******************************************/
+// Computes the LDS write base for A (scalar) and eight per-lane LDS read addresses for A.
+.set WAVE_LDS_OFFSET_A, 64*8 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load
+.set LDS_SUB_M_OFFSET, 16
+.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES
+.set LOADxWAVES_K_A_LOG2, 4
+.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES
+
+
+s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], 0x3000 // write base = GlWaveID * 0x3000
+s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] // keep the base for wrap-around
+// mcc
+s_add_u32 s[sgprLDSMask], 0x3000, s[sgprLocalWriteAddrA] // wrap limit = base + 0x3000
+
+//get lds read addrA
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_and_b32 v0, 15, v[vgprTemp0] // v0 = lane % 16
+v_lshrrev_b32 v1, 4, v[vgprTemp0] // v1 = lane / 16
+v_mul_u32_u24 v2, 0x80, v1 // v2 = (lane / 16) * 0x80
+v_add_u32 v[vgprLocalReadAddrA], v2, v0
+//s_mul_i32 s[sgprTemp0], s[sgprWaveID], 0x3000
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 // WaveID % 4
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0x3000 // (WaveID % 4) * 0x3000
+v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0]
+
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 // WaveID / 4
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 4 // (WaveID / 4) * 16
+v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0]
+
+// Addresses +1..+3 are used by LDS_LOADAB, +4..+7 by LDS_LOADAB1
+v_add_u32 v[vgprLocalReadAddrA+1], 0x20, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+2], 0x40, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+3], 0x60, v[vgprLocalReadAddrA]
+
+v_add_u32 v[vgprLocalReadAddrA+4], 0x200, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+5], 0x220, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+6], 0x240, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+7], 0x260, v[vgprLocalReadAddrA]
+
+.set WAVE_LDS_OFFSET, UNDEF //x4 load
+.set WAVE_LDS_OFFSET, UNDEF //x4 load
+
+/******************************************/
+/* Generate LDS B parameters ...
*/
+/******************************************/
+// Computes the LDS write base for B (scalar) and the per-lane LDS read address for B.
+.set WAVE_LDS_OFFSET_B, 64*16 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load
+.set LDS_SUB_N_OFFSET, BPE*32
+.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES
+.set LOADxWAVES_K_B_LOG2, 5
+.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES
+
+//Wrap Lds
+//s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad
+
+s_mov_b32 s[sgprLocalWriteAddrB], 0x0
+s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 // result is overwritten by the s_mul_i32 below before being read
+s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET
+
+s_mul_i32 s[sgprTemp0], s[sgprGlWaveID], 0x3000 // GlWaveID * 0x3000
+s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] // write base = LDS_B_OFFSET + GlWaveID * 0x3000
+s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep the base for wrap-around
+
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_and_b32 v[vgprTemp1], v[vgprTemp0], 15 // lane % 16
+v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] // lane / 16
+v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] // (lane % 16) * 64
+// Fix: the shifted value used to be written to and read from v[sgprTemp2], i.e. a VGPR
+// numbered by an SGPR symbol, which clobbered an unrelated VGPR. Shift vgprTemp2 in place.
+v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] // (lane / 16) * 8
+v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2]
+s_mov_b32 s[sgprTemp1], LDS_B_OFFSET
+v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] // + LDS_B_OFFSET
+s_mul_i32 s[sgprTemp0], s[sgprGlWaveID], 0x3000
+v_add_u32 v[vgprLocalReadAddrB], s[sgprTemp0], v[vgprLocalReadAddrB] // + GlWaveID * 0x3000
+
+
+v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] // keep a copy of the initial read address
+
+/******************************************/
+/* Keep Sgpr Values for use later ...
*/
+/******************************************/
+
+//store sgprs to keep value
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2
+
+
+/******************************************/
+/* Generate Scale Zeros */
+/******************************************/
+// Per-lane byte offsets for the scale loads (dword per lane) and zero-point loads (byte per lane).
+v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 // serial % 16
+v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] // * 4 bytes
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 // SizesSum / 128
+s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 // StridesA * 4
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] // (SizesSum / 128) * StridesA * 4
+
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 // WaveID % 4
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale]
+s_lshr_b32 s[sgprTemp2], s[sgprWaveID], 2 // WaveID / 4
+s_mul_i32 s[sgprTemp0], 64, s[sgprTemp2] // (WaveID / 4) * 64
+v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale]
+
+// Zero-point offset: same layout as the scale offset with every term divided by 4
+v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 // serial % 16
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 // SizesSum / 128
+s_mov_b32 s[sgprTemp1], s[sgprStridesA]
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] // (SizesSum / 128) * StridesA
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 // WaveID % 4
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero]
+s_lshr_b32 s[sgprTemp2], s[sgprWaveID], 2 // WaveID / 4
+s_mul_i32 s[sgprTemp0], 16, s[sgprTemp2] // (WaveID / 4) * 16
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero]
+
+s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+// Fix: the high dword of (tile start * 8) must come from the LOW dword of the tile start,
+// which sgprTemp0 still holds here. The old source operand was sgprTemp1, which dropped the carry.
+s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride
+
+// Scale descriptor: limit = ShadowLimitA >> 4, base = ScaleAddress + sgprTemp0:sgprTemp1
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // SRD base high dword plus carry
+
+s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD
+// Zero-point descriptor: limit = ShadowLimitA >> 6, base = ZeroAddress + (scale tile offset >> 2)
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp0:sgprTemp1], 2 // scale tile offset / 4
+s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp2] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp3] // SRD base = Address+ tileStart1
+s_mov_b32 s[sgprZero+3], Srd127_96 // Set bits 127_96 in SRD
+
+/******************************************/
+/* Define Global Load...
*/
+/******************************************/
+
+// GLOBAL_LOADAB offset: issues one dwordx4 buffer load for A and two for B with the lds modifier.
+// m0 is set before each load to the LDS write address plus \offset.
+.macro GLOBAL_LOADAB offset:req
+
+s_add_u32 m0, s[sgprLocalWriteAddrA], \offset
+buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrB], \offset
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+s_add_u32 m0, m0, WAVE_LDS_OFFSET_B // second B load lands WAVE_LDS_OFFSET_B further on
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2:vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+.endm
+
+// GLOBAL_LOAD_Scale_Zero: loads one zero-point byte and one scale dword per lane into VGPRs.
+.macro GLOBAL_LOAD_Scale_Zero
+buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0
+buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0
+.endm
+
+// GLOBAL_INC_Scale_Zero: advances the scale and zero-point SRD bases and rewrites their limits.
+// Uses sgprTemp0..3 as scratch and changes SCC.
+// NOTE(review): each limit is recomputed from sgprShadowLimitA minus a single increment.
+// In the visible code only GLOBAL_INC updates sgprShadowLimitA - confirm the limit still
+// shrinks as intended for the waves that call this macro.
+.macro GLOBAL_INC_Scale_Zero
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 // scale increment = GlobalReadIncsA / 8
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0]
+s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1]
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 // zero-point increment = GlobalReadIncsA / 32
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0]
+s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1]
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+.endm
+
+
+// I32ToF16: in place, v[vgprIn] = f16(f32(v[vgprIn] - v[vgprZero])) * v[vgprScale]
+.macro I32ToF16 vgprIn:req vgprZero:req vgprScale:req
+v_sub_i32 v[\vgprIn], v[\vgprIn], v[\vgprZero]
+v_cvt_f32_i32 v[\vgprIn], v[\vgprIn]
+v_cvt_f16_f32 v[\vgprIn], v[\vgprIn]
+v_mul_f16 v[\vgprIn], v[\vgprIn], v[\vgprScale]
+.endm
+
+// UnPackB32ToTwoF32: splits one dword holding two f16 values into two f32 registers.
+// vgprOut+0 gets the low half, vgprOut+1 the high half.
+.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req
+v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] // shift left then right by 16 to clear the upper half
+v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0]
+v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] // upper half moved down
+v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// UnPackB8To2F32: splits one byte into its two 4-bit halves and converts each to f32.
+// vgprOut+0 gets bits 3:0; vgprOut+1 gets byte 0 of (input >> 4).
+.macro UnPackB8To2F32 vgprZero:req vgprOut:req
+v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero]
+v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf
+v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// I4ToFp16: dequantizes the four registers vgprIn..vgprIn+3 (two 4-bit values each) to f16
+// and packs the eight results into vgprPack..vgprPack+3.
+// Each value is computed as value * scale + zero, with the register pairs at vgprScale and vgprZero
+// applied to the (low nibble, high nibble) pair. Callers pass a vgprZero that already holds -zero * scale.
+// Uses v[vgprValuA_X0_H0+0..7] as scratch regardless of the arguments.
+.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] // high nibble of input 0
+v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf // low nibble of input 0
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1]
+v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2]
+v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3]
+v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf
+
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+
+v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] // low nibbles of inputs 0 and 1
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] // low nibbles of inputs 2 and 3
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] // high nibbles of inputs 0 and 1
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] // high nibbles of inputs 2 and 3
+.endm
+
+/******************************************/
+/* Define Global Load address Increase... */
+/******************************************/
+
+// GLOBAL_INC: advances the A and B SRD bases by their per-iteration increments and
+// shrinks the A limit by the same amount. Uses sgprTemp0/1 as scratch and changes SCC.
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1]
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+
+// B side: only the SRD base is advanced; the B limit dword is left unchanged here
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1]
+
+.endm
+
+/******************************************/
+/* Define LDS Load... */
+/******************************************/
+
+// LDS_LOADAB off: reads four A bytes (read addresses +0..+3) and two 64-bit B values
+// (at \off+0 and \off+1024) into the X0 register set.
+.macro LDS_LOADAB off:req
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+1024
+.endm
+
+// LDS_LOADAB1 off: same as LDS_LOADAB for the X1 register set, using A read addresses +4..+7
+// and B offsets \off+32 and \off+1056.
+.macro LDS_LOADAB1 off:req
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+1056
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ...
*/
+/******************************************/
+
+//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5
+// mcc
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 7 // iterations = SizesSum / 128
+
+s_min_u32 s[sgprLoopCntCommon], 4, s[sgprLoopCounterL] // result and SCC are both overwritten below before being read
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 // SCC = 1 when SizesSum % 32 != 0
+// NOTE(review): the remainder mask is 31 while the iteration count above uses >> 7 - confirm this is intended
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc // one extra iteration when SCC is set
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+
+// Waves with WaveID < 8 skip the global-load path; the remaining waves run it and end at s_endpgm
+s_cmp_lt_i32 s[sgprWaveID], 8
+s_cbranch_scc1 SkipGL
+
+
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprStridesA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLDSMask]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprSizesSum+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+/*
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+2]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+3]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+*/
+.endif
+
+
+
+// Global-load loop: runs while LoopCntCommon > 1, then one final load at SkipToLastLoad
+s_cmp_eq_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 SkipToLastLoad
+s_mov_b32 s[sgprTemp3], 0 // counts loads issued so far
+PreFetchBegin:
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+GLOBAL_INC
+
+// Advance the A write address and wrap back to its base once it reaches LDSMask
+s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked
+s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprLDSMask]
+s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori]
+
+
+// Same for B. NOTE(review): B is compared against the limit derived from the A base - confirm
+s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked
+s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprLDSMask]
+s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+
+s_cmp_lt_i32 s[sgprTemp3], 2 // the first two passes do not wait or hit the barrier
+s_cbranch_scc1 SkipWait
+s_waitcnt vmcnt(6)
+s_barrier
+SkipWait:
+s_add_u32 s[sgprTemp3], s[sgprTemp3], 1
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 1
+s_cbranch_scc1 PreFetchBegin
+s_cmp_eq_i32 s[sgprTemp3], 1 // only one load was issued: skip the vmcnt(3) stage
+s_cbranch_scc1 Last1
+s_waitcnt vmcnt(3)
+s_barrier
+Last1:
+s_waitcnt vmcnt(0)
+s_barrier
+s_barrier
+
+SkipToLastLoad:
+s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] // final load goes to the base addresses
+s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori]
+GLOBAL_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt vmcnt(0)
+s_barrier
+s_endpgm
+SkipGL:
+
+/******************************************/
+/* Generate SrcD ...
*/
+/******************************************/
+
+// Builds the D buffer descriptor (s[sgprSrdD..+3]) and the per-lane write offset v[vgprGlobalWriteOffsetD].
+s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower)
+s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 64 // WorkGroup0 * 64
+s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] // WorkGroup1 * MT1 * StridesD
+s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] // tile start
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] // 64-bit StridesC+1 * WorkGroup2
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // add the tile start
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // multiply the 64-bit offset by 2
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2]
+s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3]
+s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] // StridesD * SizesFree+1
+s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] // minus WorkGroup1 * MT1 * StridesD
+s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 // limit = that difference * 2
+s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD
+
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] // lane / 16
+v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] // lane % 16
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] // (lane % 16) * 4
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 // WaveID % 4
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // (WaveID % 4) * 8
+v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1]
+v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] // scaled by StridesD
+v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] // * 2
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2]
+
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 // WaveID / 4
+//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 // (WaveID / 4) * 64
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0]
+
+/******************************************/
+/* Init ValueC ...
*/
+/******************************************/
+// Issues the first scale/zero-point load, clears the 16 accumulator registers, then
+// prepares the dequantization constants before entering the main or tail loop.
+GLOBAL_LOAD_Scale_Zero
+v_mov_b32 v[vgprValuC+0], 0x0
+v_mov_b32 v[vgprValuC+1], 0x0
+v_mov_b32 v[vgprValuC+2], 0x0
+v_mov_b32 v[vgprValuC+3], 0x0
+v_mov_b32 v[vgprValuC+4], 0x0
+v_mov_b32 v[vgprValuC+5], 0x0
+v_mov_b32 v[vgprValuC+6], 0x0
+v_mov_b32 v[vgprValuC+7], 0x0
+v_mov_b32 v[vgprValuC+8], 0x0
+v_mov_b32 v[vgprValuC+9], 0x0
+v_mov_b32 v[vgprValuC+10], 0x0
+v_mov_b32 v[vgprValuC+11], 0x0
+v_mov_b32 v[vgprValuC+12], 0x0
+v_mov_b32 v[vgprValuC+13], 0x0
+v_mov_b32 v[vgprValuC+14], 0x0
+v_mov_b32 v[vgprValuC+15], 0x0
+
+GLOBAL_INC_Scale_Zero
+// Fix: unpack the scale/zero-point values BEFORE the LoopCounter == 0 branch. Previously the
+// branch to TAIL_LOOP skipped the unpack, so the tail loop ran I4ToFp16 on
+// vgprValuScalesF32 / vgprValuZerosI32 that had never been written from the loaded data.
+// Only the wait and VALU work move; the barrier stays after the branch so barrier counts are unchanged.
+s_waitcnt vmcnt(0)
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] // zero term = -zero * scale
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] // zero term = -zero * scale
+/******************************************/
+/* LoopCounter == 0, Skip to tail/last loop ... */
+/******************************************/
+s_cmp_le_i32 s[sgprLoopCounterL], 0
+s_cbranch_scc1 TAIL_LOOP
+s_barrier
+
+LDS_LOADAB LDS_BLK_OFFSET*0
+
+/******************************************/
+/* Main Loop Process ...
*/
+/******************************************/
+// Main loop, unrolled four times. Each quarter runs an X0 half and an X1 half
+// (LDS read, I4ToFp16, MMAC) and decrements LoopCntCommon, leaving for TAIL_LOOP when it reaches 0.
+// Quarters 2 and 4 also reload and unpack the scale/zero-point values.
+
+//s_cmp_ge_u32 s[sgprWaveID], 4
+//s_cbranch_scc1 WaveID_gecase
+
+s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL]
+MainLoopBeginW0_3:
+
+// Quarter 1
+LDS_LOADAB1 LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// Quarter 2: new scale/zero-point values are loaded here and unpacked after the second I4ToFp16
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*1
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] // zero term = -zero * scale
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] // zero term = -zero * scale
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// Quarter 3
+LDS_LOADAB1 LDS_BLK_OFFSET*2
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_barrier
+LDS_LOADAB LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_le_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 TAIL_LOOP
+
+// Quarter 4: reloads scale/zero-point values and wraps the LDS block offset back to 0
+GLOBAL_LOAD_Scale_Zero
+LDS_LOADAB1 LDS_BLK_OFFSET*3
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_0
+s_waitcnt vmcnt(0)
+s_barrier
+GLOBAL_INC_Scale_Zero
+LDS_LOADAB LDS_BLK_OFFSET*0
+s_waitcnt lgkmcnt(6)
+I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0
+UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0
+UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0
+v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] // zero term = -zero * scale
+v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] // zero term = -zero * scale
+.align32 8, 0xbf800001
+s_nop (1)
+MMAC_32x32_1
+
+s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1
+s_cmp_gt_i32 s[sgprLoopCntCommon], 0
+s_cbranch_scc1 MainLoopBeginW0_3
+s_cmp_le_i32 s[sgprLoopCntCommon], 0 // TAIL_LOOP is the next label, so this compare and branch only fall through to it
+s_cbranch_scc1 TAIL_LOOP
+
+/******************************************/
+/* Tail Loop Process ...
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1] + +s_lshr_b32 s[sgprTemp2], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], 4096, s[sgprTemp2] +v_add_u32 v[vgprLocalWriteC], s[sgprTemp2], v[vgprLocalWriteC] + + +v_mov_b32 v[vgprLocalReadC], v[vgprLocalWriteC] +v_add_u32 v[vgprLocalReadC+1], 0x3000, v[vgprLocalReadC] +v_add_u32 v[vgprLocalReadC+2], 0x6000, v[vgprLocalReadC] +v_add_u32 v[vgprLocalReadC+3], 0x9000, v[vgprLocalReadC] + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_mul_u32_u24 v[vgprTemp1], 0x3000, v[vgprTemp1] +v_add_u32 v[vgprLocalWriteC], v[vgprLocalWriteC], v[vgprTemp1] + + +s_barrier +s_and_b32 s[sgprTemp3], s[sgprWaveID], 3 + +s_cmp_eq_u32 s[sgprTemp3], 0 +s_cbranch_scc0 Skip_Wave0 + +//ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +//ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +//ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +//ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 
+ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[8], v[1] +v_mov_b32 v[12], v[5] + +//ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:0 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:0 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:0 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +//ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:256 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:256 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:256 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +//ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:512 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:512 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:512 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +//ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:768 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:768 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:768 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] 
+v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave0: + + +s_cmp_eq_u32 s[sgprTemp3], 1 +s_cbranch_scc0 Skip_Wave1 + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +//ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +//ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +//ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +//ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[1], v[2] +v_mov_b32 v[5], v[6] +v_mov_b32 v[9], v[3] +v_mov_b32 v[13], v[7] + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:1024 +//ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:1024 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:1024 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:1024 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:1280 +//ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:1280 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:1280 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:1280 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 
v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:1536 +//ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:1536 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:1536 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:1536 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:1792 +//ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:1792 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:1792 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:1792 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave1: + +s_cmp_eq_u32 s[sgprTemp3], 2 +s_cbranch_scc0 Skip_Wave2 + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +//ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +//ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +//ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +//ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[2], v[8] +v_mov_b32 v[6], 
v[12] +v_mov_b32 v[10], v[9] +v_mov_b32 v[14], v[13] + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:2048 +//ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:2048 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:2048 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:2304 +//ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:2304 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:2304 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:2560 +//ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:2560 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:2560 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:2816 +//ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:2816 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:2816 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave2: + +s_cmp_eq_u32 s[sgprTemp3], 3 
+s_cbranch_scc0 Skip_Wave3 + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +//ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +//ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +//ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +//ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +v_mov_b32 v[3], v[10] +v_mov_b32 v[7], v[14] +v_mov_b32 v[11], v[11] +v_mov_b32 v[15], v[15] + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+1] offset:3072 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+2] offset:3072 +//ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+3] offset:3072 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+1] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+2] +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+3] + + +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+1] offset:3328 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+2] offset:3328 +//ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+3] offset:3328 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+5] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+6] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+7] + +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] 
offset:3584 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+1] offset:3584 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+2] offset:3584 +//ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+3] offset:3584 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+11] + +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:3840 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+1] offset:3840 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+2] offset:3840 +//ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+3] offset:3840 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+15] +Skip_Wave3: + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 32 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 32*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +//s_mov_b32 s[sgprTemp1], s[sgprWaveID] +//s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +//v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 
s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, 
v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..435e88ee0c906fada2c062df8e6d236a90076f35 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,2500 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 61440 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* 
Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: 
u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 61440 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_8_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 + + +.set vgprValuB_X0_I0, 64 +.set vgprValuB_X1_I0, 80 + +.set vgprValuA_X0_H0, 96 +.set vgprValuA_X1_H0, 104 +.set vgprValuA_X2_I0, 112 +.set vgprValuA_X3_I0, 116 + + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + + +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 
+/* SGPR index map (continued). Each .set binds a symbolic name to an SGPR index; code refers to it as s[sgprName]. */ +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 //define debug buffer address + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 +.set sgprLDSMask, 75 +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 +/* NOTE(review): sgprTensor2dSizeA and sgprTensor2dSizeB share index 84, and the next two lines rebind sgprWaveID (64 -> 88) and sgprLDSMask (75 -> 89). Presumably the later values are the ones used by the code below -- confirm with the assembler. */ +.set sgprWaveID, 88 +.set sgprLDSMask, 89 + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +/* NOTE(review): the three names below reuse indices 96-98, which are also bound to sgprTemp4..sgprTemp6 above -- confirm the two uses are never live at the same time. */ +.set sgprStrideStruct, 96 +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +/* Registers named for the scale and zero data. NOTE(review): sgprLocalWriteAddrZero (75) and sgprZero (64) reuse the indices first given to sgprLDSMask and sgprWaveID before those were rebound, and sgprScale (20) is the index also bound to sgprSrdC earlier in this file -- confirm the aliasing is intentional. */ +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 + + + +/* MMAC_32x64_0: emits 16 v_mmac_f32_16x16x16_f16 instructions, numbered t = 0..15 in order. Instruction t writes the four registers vgprValuC+4t .. vgprValuC+4t+3 and names the same four registers as its last source operand, so each result presumably accumulates in place. Its second operand is register pair (t mod 2) of vgprValuA_X2_I0 and its third operand is register pair floor(t / 2) of vgprValuB_X0_I0. s_setprio 1 is issued after the first instruction and s_setprio 0 after the last. */ +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16
v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+10*4+0:
vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // +s_setprio 0 +.endm + +/* MMAC_32x64_1: same 16-instruction pattern and the same vgprValuC destination registers as MMAC_32x64_0, but the second operand comes from vgprValuA_X3_I0 and the third from vgprValuB_X1_I0. */ +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16
v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+9*4+0:
vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // +s_setprio 0 +.endm + +/* Assembly-time constants. BPE, DEPTHU and GLWAVES each equal 1 shifted left by their LOG2 counterpart. NOTE(review): PFTLOOPS * LDS_BLK_OFFSET = 6 * 10240 = 61440, which equals the declared .amdhsa_group_segment_fixed_size -- presumably deliberate, confirm. MT0 is 32 although the kernel symbol name contains MT64x128x32 -- confirm. */ +.set MT0, 32 +.set MT1, 128 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 10240 +.set LDS_BLK_OFFSET_64Kmasked, 10240 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 128 + +/* Load num of Gemms */
+s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0xF000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 
v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = 
s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM 
+s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] // divisor for the split below: the remainder when blockId >= numFullBlocks, WGM otherwise +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer //compute the debug-buffer address offset of each workgroup +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v1=wg1*nwg0+wg0 +v_lshlrev_b32 v5, 0x8, v4 // v1 = v1 * 256 //this is the total thread count; adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc 
// v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 15, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +//v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprGlobalReadOffsetB+1] +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprGlobalReadOffsetB+0] + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 
1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] + +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprTemp0+0] +v_mul_lo_u32 v[vgprGlobalReadOffsetB+3], s[sgprStrideStruct], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1], v[vgprGlobalReadOffsetB+3] + +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+3], 0x1, v[vgprGlobalReadOffsetB+3] +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +/* +v_and_b32 v3, 1, v1 +v_mul_u32_u24 v3, 0x40, v3 +v_mul_u32_u24 v2, WAVE_LDS_OFFSET_A+0, v1 +v_add_u32 v2, v3, v2 +*/ +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//fixed input: v[vgprTemp0]->added address; +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm 
+v_mov_b32 v[vgprTemp2], 0x200 +v_mov_b32 v[vgprTemp1], 0x400 +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*64 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep the initial write address for later resets + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // tid & 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 // (tid & 63) & 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] // (tid & 63) >> 4 +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] // ((tid & 63) & 3) * 64 +v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] // ((tid & 63) >> 4) * 8, kept in the VGPR scratch register; an SGPR symbol must not be used as a VGPR index +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] // ((tid & 63) & 15) >> 2 +s_mov_b32 s[sgprTemp1], 0x410 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] // * 0x410 +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2] // sum of the three partial offsets +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] // B region starts at LDS_B_OFFSET +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] + +v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB] +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // 
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 1024 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] // +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 1024 +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + +/******************************************/ +/* Keep Sgpr Values for use later ... */ +/******************************************/ + +//store sgprs to keep value +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2 + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + + + +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 16 +v_add_u32 
v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], 8 // high dword of (low dword * 8); must read Temp0 before the s_mul_i32 below overwrites it +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // limit for the Scale buffer = (ShadowLimitA >> 4) - tile offset +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 // Zero tile offset = Scale tile offset / 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // limit for the Zero buffer = (ShadowLimitA >> 6) - tile offset +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds +s_add_u32 m0, m0, WAVE_LDS_OFFSET_B*4 +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2:vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +.macro GLOBAL_INC_Scale_Zero + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 // Scale increment = GlobalReadIncsA / 8 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] // candidate new limit = limit - increment +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // remaining limit smaller than the increment ? +s_cmov_b32 s[sgprScale+2], 0 // then clamp the limit to 0 instead of letting it wrap +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // limit still covers a full increment ? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // then limit -= increment + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 // Zero increment = GlobalReadIncsA / 32 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] // candidate new limit = limit - increment +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // remaining limit smaller than the increment ? +s_cmov_b32 s[sgprZero+2], 0 // then clamp the limit to 0 instead of letting it wrap +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // limit still covers a full increment ? 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + + +.macro I32ToF16 vgprIn:req vgprZero:req vgprScale:req +v_sub_i32 v[\vgprIn], v[\vgprIn], v[\vgprZero] +v_cvt_f32_i32 v[\vgprIn], v[\vgprIn] +v_mul_f32 v[\vgprIn], v[\vgprIn], v[\vgprScale] +v_cvt_f16_f32 v[\vgprIn], v[\vgprIn] +.endm + +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] 
+v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] +.endm + + +/******************************************/ +/* Define Global Load adress Increase... */ +/******************************************/ + +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] +//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] +//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + + +.endm + +/******************************************/ +/* Define LDS Load... */ +/******************************************/ + +.macro LDS_LOADAB off:req + +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off + +ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0 +ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256 +ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512 +ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0 + +ds_read_b64 v[vgprValuB_X0_I0+ 8:vgprValuB_X0_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4096 +ds_read_b64 v[vgprValuB_X0_I0+ 10:vgprValuB_X0_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4352 +ds_read_b64 v[vgprValuB_X0_I0+ 12:vgprValuB_X0_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4608 +ds_read_b64 v[vgprValuB_X0_I0+ 14:vgprValuB_X0_I0+ 15], v[vgprLocalReadAddrB+1] offset:\off+4096 +.endm + +.macro LDS_LOADAB1 off:req + +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] 
offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off + +ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32 +ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288 +ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544 +ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0 + +ds_read_b64 v[vgprValuB_X1_I0+ 8:vgprValuB_X1_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4128 +ds_read_b64 v[vgprValuB_X1_I0+ 10:vgprValuB_X1_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4384 +ds_read_b64 v[vgprValuB_X1_I0+ 12:vgprValuB_X1_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4640 +ds_read_b64 v[vgprValuB_X1_I0+ 14:vgprValuB_X1_I0+ 15], v[vgprLocalReadAddrB+2] offset:\off+4096 +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cbranch_scc1 SkipGL +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + +s_cmp_lt_i32 s[sgprTemp3], 4 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(12) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_cmp_eq_i32 s[sgprTemp3], 2 +s_cbranch_scc1 Last2 +s_cmp_eq_i32 s[sgprTemp3], 3 +s_cbranch_scc1 Last3 +s_waitcnt vmcnt(9) +s_barrier +Last3: +s_waitcnt vmcnt(6) +s_barrier +Last2: +s_waitcnt vmcnt(3) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 +v_mov_b32 v[vgprValuC+16], 0x0 +v_mov_b32 v[vgprValuC+17], 0x0 +v_mov_b32 v[vgprValuC+18], 0x0 +v_mov_b32 v[vgprValuC+19], 0x0 +v_mov_b32 v[vgprValuC+20], 0x0 +v_mov_b32 v[vgprValuC+21], 0x0 +v_mov_b32 v[vgprValuC+22], 0x0 +v_mov_b32 v[vgprValuC+23], 0x0 +v_mov_b32 v[vgprValuC+24], 0x0 +v_mov_b32 v[vgprValuC+25], 0x0 +v_mov_b32 v[vgprValuC+26], 0x0 +v_mov_b32 v[vgprValuC+27], 0x0 +v_mov_b32 v[vgprValuC+28], 0x0 +v_mov_b32 v[vgprValuC+29], 0x0 +v_mov_b32 v[vgprValuC+30], 0x0 +v_mov_b32 v[vgprValuC+31], 0x0 +v_mov_b32 v[vgprValuC+32], 0x0 +v_mov_b32 v[vgprValuC+33], 0x0 +v_mov_b32 v[vgprValuC+34], 0x0 +v_mov_b32 v[vgprValuC+35], 0x0 +v_mov_b32 v[vgprValuC+36], 0x0 +v_mov_b32 v[vgprValuC+37], 0x0 +v_mov_b32 v[vgprValuC+38], 0x0 +v_mov_b32 v[vgprValuC+39], 0x0 +v_mov_b32 v[vgprValuC+40], 0x0 +v_mov_b32 v[vgprValuC+41], 0x0 +v_mov_b32 v[vgprValuC+42], 0x0 +v_mov_b32 v[vgprValuC+43], 0x0 +v_mov_b32 v[vgprValuC+44], 0x0 +v_mov_b32 v[vgprValuC+45], 0x0 +v_mov_b32 v[vgprValuC+46], 0x0 +v_mov_b32 v[vgprValuC+47], 0x0 +v_mov_b32 v[vgprValuC+48], 0x0 +v_mov_b32 v[vgprValuC+49], 0x0 +v_mov_b32 v[vgprValuC+50], 0x0 +v_mov_b32 v[vgprValuC+51], 0x0 +v_mov_b32 v[vgprValuC+52], 0x0 +v_mov_b32 v[vgprValuC+53], 0x0 +v_mov_b32 v[vgprValuC+54], 0x0 +v_mov_b32 v[vgprValuC+55], 0x0 +v_mov_b32 v[vgprValuC+56], 0x0 +v_mov_b32 v[vgprValuC+57], 0x0 +v_mov_b32 v[vgprValuC+58], 0x0 +v_mov_b32 v[vgprValuC+59], 0x0 +v_mov_b32 
v[vgprValuC+60], 0x0 +v_mov_b32 v[vgprValuC+61], 0x0 +v_mov_b32 v[vgprValuC+62], 0x0 +v_mov_b32 v[vgprValuC+63], 0x0 + +GLOBAL_INC_Scale_Zero + + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 + + + +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 + +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... */ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) + + +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) 
+I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 
s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP +s_branch WaveID_EndSwitch +WaveID_gecase: + +WaveID_EndSwitch: + +/******************************************/ +/* Tail Loop Process ... 
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_mov_b32 s[sgprTemp1], s[sgprWaveID] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], 
v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 
s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] 
with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], 
v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], 
s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], 
s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] 
+s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 
s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, 
v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], 
s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + + + + + + + + + +.set Nvoff, 32 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 33 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] 
//v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 34 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short 
v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 35 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], 
v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 40 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 
s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 41 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 42 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 43 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 
v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 48 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 49 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 50 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] 
//v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 51 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // 
store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 56 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 
0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 57 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + 
+.set Nvoff, 58 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 59 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 
v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s new file mode 100644 index 0000000000000000000000000000000000000000..dac1637507aae3c435df31264a88119293c5053e --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s @@ -0,0 +1,2552 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3,@function +.section 
.rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: 
SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + 
.private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x128x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +.set vgprValuB_X0_I0, 64 +.set vgprValuB_X1_I0, 80 + +.set vgprValuA_X0_H0, 96 +.set vgprValuA_X1_H0, 104 +.set vgprValuA_X2_I0, 112 +.set vgprValuA_X3_I0, 116 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 211 +.set vgprGlobalReadOffsetB, 212 +.set vgprGlobalReadOffsetB1, 216 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 
20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + + +// user-defined SGPR assignments +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) - confirm the overlap is intended +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprLDSMask, 75 // NOTE(review): assigned again (89) further down; 75 is also sgprLocalWriteAddrZero +//.set sgprLoopforPfIter, 76 +//.set sgprLDSWriteIter, 78 + +.set sgprLocalWriteAddrA1, 76 +.set sgprLocalWriteAddrB1, 77 +.set sgprLocalWriteAddrA1ori, 78 +.set sgprLocalWriteAddrB1ori, 79 + + + +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 // second assignment of sgprLDSMask (the first was 75) + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same index as sgprTemp4 +.set sgprStructBit, 97 // same index as sgprTemp5 +.set sgprStructNum, 98 // same index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // same index as sgprSrdC (20) + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: 
vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] 
v[vgprValuB_X0_I0+4*2+0:vgprValuB_X0_I0+4*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+5*2+0:vgprValuB_X0_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+6*2+0:vgprValuB_X0_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+7*2+0:vgprValuB_X0_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: vgprValuC+15*4+2: vgprValuC+15*4+3] // + +s_setprio 0 +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: 
vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + +v_mmac_f32_16x16x16_f16 v[vgprValuC+8*4+0:vgprValuC+8*4+1:vgprValuC+8*4+2:vgprValuC+8*4+3] 
v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+8*4+0: vgprValuC+8*4+1: vgprValuC+8*4+2: vgprValuC+8*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+9*4+0:vgprValuC+9*4+1:vgprValuC+9*4+2:vgprValuC+9*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+4*2+0:vgprValuB_X1_I0+4*2+1] v[vgprValuC+9*4+0: vgprValuC+9*4+1: vgprValuC+9*4+2: vgprValuC+9*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+10*4+0:vgprValuC+10*4+1:vgprValuC+10*4+2:vgprValuC+10*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+10*4+0: vgprValuC+10*4+1: vgprValuC+10*4+2: vgprValuC+10*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+11*4+0:vgprValuC+11*4+1:vgprValuC+11*4+2:vgprValuC+11*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+5*2+0:vgprValuB_X1_I0+5*2+1] v[vgprValuC+11*4+0: vgprValuC+11*4+1: vgprValuC+11*4+2: vgprValuC+11*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+12*4+0:vgprValuC+12*4+1:vgprValuC+12*4+2:vgprValuC+12*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+12*4+0: vgprValuC+12*4+1: vgprValuC+12*4+2: vgprValuC+12*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+13*4+0:vgprValuC+13*4+1:vgprValuC+13*4+2:vgprValuC+13*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+6*2+0:vgprValuB_X1_I0+6*2+1] v[vgprValuC+13*4+0: vgprValuC+13*4+1: vgprValuC+13*4+2: vgprValuC+13*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+14*4+0:vgprValuC+14*4+1:vgprValuC+14*4+2:vgprValuC+14*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+14*4+0: vgprValuC+14*4+1: vgprValuC+14*4+2: vgprValuC+14*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+15*4+0:vgprValuC+15*4+1:vgprValuC+15*4+2:vgprValuC+15*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+7*2+0:vgprValuB_X1_I0+7*2+1] v[vgprValuC+15*4+0: vgprValuC+15*4+1: 
vgprValuC+15*4+2: vgprValuC+15*4+3] // + +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 128 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 10240 +.set LDS_BLK_OFFSET_64Kmasked, 10240 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // advance the kernarg pointer by 16 bytes (past the common args) +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 // propagate the carry into the high dword +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // s24..s39 cover sgprSizesFree (24) through sgprStridesC+1 (39) +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // s40..s43 cover sgprStridesA (40) and sgprStridesB (42) +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // s44 = sgprAlpha, s45 = sgprBeta +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) // wait until the scalar loads issued above have returned +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 // NOTE(review): looks like the offset for the commented-out load on the next line; appears otherwise unused +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) // NOTE(review): no load is issued since the previous wait - likely redundant +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] // keep the incoming workgroup id before it is remapped +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 // s[sgprSizesFree+0] >>= 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 // s[sgprStridesA] >>= 1 + +v_mov_b32 v[vgprSerial], v0 // keep the incoming v0 in vgprSerial +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 // >> 6, i.e. divide by 64 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 // wave id modulo GLWAVES (4) +s_mov_b32 s[sgprLDSMask], 0x7800 + + +v_mov_b32 v8, MT0 // v8 = MT0 (divisor of the ceil division that follows) +v_mov_b32 v7, s[sgprSizesFree+0] // v7 = s[sgprSizesFree+0] (dividend) 
+v_cvt_f32_u32 v6, v8 // v6 = float(v8), the divisor +v_rcp_iflag_f32 v6, v6 // v6 = 1.0 / divisor +v_cvt_f32_u32 v9, v7 // v9 = float(v7), the dividend +v_mul_f32 v6, v6, v9 // v6 = dividend * (1 / divisor) +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate as unsigned integer +v_mul_u32_u24 v9, v6, v8 // v9 = quotient * divisor (24-bit multiply) +v_sub_u32 v9, v7, v9 // v9 = dividend - quotient * divisor +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil: add the vcc carry-in, i.e. +1 when the remainder is non-zero +v_mov_b32 v8, MT1 // v8 = MT1 (divisor of the second ceil division) +v_mov_b32 v7, s[sgprSizesFree+1] // v7 = s[sgprSizesFree+1] (dividend) +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // NumWorkGroups0 = result of the first ceil division +v_cvt_f32_u32 v6, v8 // v6 = float(v8), the divisor +v_rcp_iflag_f32 v6, v6 // v6 = 1.0 / divisor +v_cvt_f32_u32 v9, v7 // v9 = float(v7), the dividend +v_mul_f32 v6, v6, v9 // v6 = dividend * (1 / divisor) +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate as unsigned integer +v_mul_u32_u24 v9, v6, v8 // v9 = quotient * divisor (24-bit multiply) +v_sub_u32 v9, v7, v9 // v9 = dividend - quotient * divisor +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil: add the vcc carry-in, i.e. +1 when the remainder is non-zero +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // NumWorkGroups1 = result of the second ceil division + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // s79 = low 14 bits of s[sgprGSU] +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // v6 = float(s78), the divisor +v_rcp_iflag_f32 v6, v6 // v6 = 1.0 / divisor +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // v7 = float(s[sgprWorkGroup0]), the dividend +v_mul_f32 v6, v6, v7 // v6 = dividend * (1 / divisor) +v_cvt_u32_f32 v6, v6 // v6 = quotient estimate as unsigned integer +v_mul_u32_u24 v7, v6, s78 // v7 = quotient * divisor (24-bit multiply) +v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = dividend - quotient * divisor +v_cmpx_eq_u32 exec, v7, s78 // keep only lanes where the remainder equals the divisor +v_add_u32 v6, 1, v6 // in those lanes, bump the quotient by 1 +s_mov_b64 exec, -1 // re-enable all lanes +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 
* numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = 
wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // NOTE(review): this value is overwritten by the s_mul_i32 on the next line +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = s[sgprWorkGroup0Ori] +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; 256 is the total thread count here - adjust it to match the number of waves 
+v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + +//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice +v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + + + + + + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32, else use BufferLimit; NOTE(review): BufferLimit is .set to 0 in this file (0xffffffff is commented out) - confirm intended +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // high 32 bits of s[sgprStridesA+1] * wg2 +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // low 32 bits of s[sgprStridesA+1] * wg2 +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // s[sgprTemp1] = s[sgprSizesFree+0] & 1 +s_cmp_eq_u32 s[sgprTemp1], 0 // low bit clear? +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] // GlobalReadIncsA = DEPTHU * BPE * s[sgprStridesA] + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 15, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprGlobalReadOffsetB+1] +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprGlobalReadOffsetB+0] + + + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] +v_mov_b32 v[vgprGlobalReadOffsetB1+2], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB1+3], s[sgprSizesSum], v[vgprGlobalReadOffsetB+3] + + + + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 
+s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] + + +v_add_u32 v[vgprGlobalReadOffsetB+2], 64, v[vgprTemp0+0] +v_mul_lo_u32 v[vgprGlobalReadOffsetB+3], s[sgprStrideStruct], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1], v[vgprGlobalReadOffsetB+3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+3], 0x1, v[vgprGlobalReadOffsetB+3] + + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + +v_mov_b32 v[vgprGlobalReadOffsetB1+2], v[vgprGlobalReadOffsetB+2] +v_add_u32 v[vgprGlobalReadOffsetB1+3], s[sgprSizesSum], v[vgprGlobalReadOffsetB+3] + + + +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], 
s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? 
+s_cbranch_scc1 label_SkipMmac // taken when the preceding 64-bit compare found AddressB == 0
+
+// Extra bits for SrdB word 1, built in Temp1:
+//   Temp1 = (((StridesB * 2) >> (StridesB >> 13)) << 16) | 0x40000000
+// NOTE(review): presumably the <<16 field is the descriptor stride and 0x40000000 a
+// struct/swizzle flag -- confirm against the target's buffer resource descriptor layout.
+s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // shift count = StridesB >> 13
+s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // StridesB * 2
+s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // scale the doubled stride down by that count
+s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // move into bits 16 and up
+s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // set bit 30
+
+// Replace Temp1 with (1 << StructBit) | 0x40000000 unless Temp7 == 1 or SizesSum <= 512.
+s_cmp_eq_u32 s[sgprTemp7], 1 // Temp7 == 1 ?
+s_cbranch_scc1 Skip1_K4096 // yes: keep the stride-derived Temp1
+s_cmp_le_u32 s[sgprSizesSum+0], 512 // SizesSum <= 512 ?
+s_cbranch_scc1 Skip1_K_Sub2048 // yes: keep the stride-derived Temp1
+s_mov_b32 s[sgprStructNum], 1 //
+s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // 1 << StructBit
+s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // set bit 30
+s_mov_b32 s[sgprTemp1], s[sgprStructNum] // override
+Skip1_K_Sub2048: //
+Skip1_K4096:
+
+s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer,
+
+//Struct index Limit
+// SrdB+2 = MT1 for every workgroup row except the last one, which gets
+// SizesFree+1 modulo MT1 (or MT1 when that remainder is 0).
+s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] // remainder (mask form, so MT1 is assumed to be a power of two)
+s_cmp_eq_u32 s[sgprTemp0], 0
+s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] // remainder 0 -> full tile
+s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 // index of the last workgroup along dim 1
+s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1]
+s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 // last workgroup: remainder, otherwise MT1
+//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge
+s_mov_b32 s[sgprSrdB+2], s[sgprTemp0]
+
+s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE // per-iteration byte increment applied to SrdB by GLOBAL_INC
+
+/******************************************/
+/* Generate LDS A parameters ... */
+/******************************************/
+.set WAVE_LDS_OFFSET_A, 64*8 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load
+//.set LDS_SUB_M_OFFSET, BPE*32
+.set LDS_SUB_M_OFFSET, 16
+.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES
+.set LOADxWAVES_K_A_LOG2, 4
+.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES
+
+//Wrap Lds
+// LocalWriteAddrA = GlWaveID * WAVE_LDS_OFFSET_A, with ((GlWaveID & 1) * 4) << 16 OR-ed into the upper half.
+// NOTE(review): the meaning of the upper-half bits (the value is later added into m0) is hardware specific -- confirm.
+s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad
+s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16
+s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0]
+s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] // keep the unrotated start address
+
+s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] // second write address, 0x8000 above the first
+
+//get lds read addrA
+// LocalReadAddrA = (lane >> 4) * 0x40 + (lane & 15) + (WaveID & 3) * LDS_SUB_M_OFFSET + (WaveID >> 2) * 0x8000
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_and_b32 v0, 15, v[vgprTemp0]
+v_lshrrev_b32 v1, 4, v[vgprTemp0]
+v_mul_u32_u24 v2, 0x40, v1
+
+v_add_u32 v[vgprLocalReadAddrA], v2, v0
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET
+v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0]
+
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2
+s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 // Temp2 stays live: it is the wave-group base used by the ADDR_WRAP calls below
+v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2]
+
+// Read addresses +1..+3 are used by LDS_LOADAB, +4..+7 by LDS_LOADAB1.
+v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA]
+
+v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA]
+
+
+// ADDR_WRAP vaddr: if \vaddr >= v[vgprTemp1] then \vaddr -= v[vgprTemp2].
+// Clobbers s[sgprTemp0:sgprTemp1] (compare mask) and v[vgprTemp3].
+//fixed input: v[vgprTemp0]->added address;
+// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer
+.macro ADDR_WRAP vaddr:req
+v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1]
+v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1]
+v_sub_u32 \vaddr, \vaddr, v[vgprTemp3]
+.endm
+v_mov_b32 v[vgprTemp2], 0x200 // amount subtracted on wrap
+v_mov_b32 v[vgprTemp1], 0x400
+v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // clip = wave-group base + 0x400
+ADDR_WRAP v[vgprLocalReadAddrA+5]
+v_mov_b32 v[vgprTemp1], 0x800
+v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // clip = wave-group base + 0x800
+ADDR_WRAP v[vgprLocalReadAddrA+7]
+
+.set WAVE_LDS_OFFSET, UNDEF //x4 load
+.set WAVE_LDS_OFFSET, UNDEF //x4 load
+
+/******************************************/
+/* Generate LDS B parameters ... */
+/******************************************/
+.set WAVE_LDS_OFFSET_B, 64*16 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load
+.set LDS_SUB_N_OFFSET, BPE*64
+.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES
+.set LOADxWAVES_K_B_LOG2, 5
+.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES
+
+//Wrap Lds
+// LocalWriteAddrB = GlWaveID * WAVE_LDS_OFFSET_B | ((GlWaveID & 3) << 16), then + LDS_B_OFFSET.
+s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad
+s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3
+//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16
+s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0]
+s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET
+s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep the unrotated start address
+s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB]
+
+// LocalReadAddrB = (lane & 3) * 64 + ((lane & 15) >> 2) * 0x410 + (lane >> 4) * 8 + LDS_B_OFFSET
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63
+v_and_b32 v[vgprTemp1], v[vgprTemp0], 3
+v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0]
+v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1]
+// Fix: this temporary used to be v[sgprTemp2], i.e. the VGPR whose index equals the
+// SGPR symbol's number, silently clobbering an unrelated vector register.
+// v[vgprTemp3] is scratch here (last written inside ADDR_WRAP above).
+v_lshlrev_b32 v[vgprTemp3], 3, v[vgprTemp2]
+v_and_b32 v[vgprTemp0], v[vgprTemp0], 15
+v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0]
+s_mov_b32 s[sgprTemp1], 0x410
+v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0]
+v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0]
+v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp3]
+s_mov_b32 s[sgprTemp1], LDS_B_OFFSET
+v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1]
+v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] // snapshot before the wave-group offset is added
+
+// LocalReadAddrB+1 = LocalReadAddrB + 768, wrapped back by 1024 when it reaches
+// LDS_B_OFFSET + (((lane & 15) >> 2) << 10) + 1024.
+v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB]
+v_and_b32 v0, 63, v[vgprSerial] // lane id
+v_and_b32 v1, 15, v0 // lane & 15
+v_lshrrev_b32 v2, 2, v1 // (lane & 15) >> 2
+v_lshlrev_b32 v3, 10, v2 // * 1024
+v_add_u32 v3, LDS_B_OFFSET, v3 // start of this lane's 1024-byte window
+v_add_u32 v4, 1024, v3 // end of the window
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 // NOTE(review): hard-coded SGPR pair s[80:81] is used as the mask -- confirm it is free here
+v_mov_b32 v5, 1024 //
+v_cndmask_b32 v5, 0, v5, s[80:81] // 1024 where the address ran past the window, else 0
+v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 //
+
+// Same wrap for LocalReadAddrB+2 = LocalReadAddrB + 800 (v0-v4 are recomputed to the same values).
+v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] //
+v_and_b32 v0, 63, v[vgprSerial] //
+v_and_b32 v1, 15, v0 //
+v_lshrrev_b32 v2, 2, v1 //
+v_lshlrev_b32 v3, 10, v2 //
+v_add_u32 v3, LDS_B_OFFSET, v3 //
+v_add_u32 v4, 1024, v3 //
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4
+v_mov_b32 v5, 1024
+v_cndmask_b32 v5, 0, v5, s[80:81]
+v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 //
+
+
+// Add the wave-group base (WaveID >> 2) * 0x8000 to all three B read addresses.
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0x8000
+v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp0]
+v_add_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], s[sgprTemp0]
+v_add_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], s[sgprTemp0]
+
+
+
+
+/******************************************/
+/* Keep Sgpr Values for use later ... */
+/******************************************/
+
+//store sgprs to keep value
+// SrdA/SrdB words 0..2 are parked in fixed lanes of v[vgprKeepSgprValue].
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1
+v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2
+
+
+/******************************************/
+/* Generate Scale Zeros */
+/******************************************/
+// GlobalReadOffsetScale = (lane & 15) * 4 + (WaveID & 3) * 64 + (WaveID >> 2) * (SizesSum >> 6) * StridesA * 4
+v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15
+v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale]
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64
+v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale]
+
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // SizesSum / 64
+s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1]
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale]
+
+
+// GlobalReadOffsetZero = (lane & 15) + (WaveID & 3) * 16 + (WaveID >> 2) * (SizesSum >> 6) * StridesA
+v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero]
+
+
+s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // SizesSum / 64
+//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2
+s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA]
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero]
+
+
+// For waves with WaveID < 8 only: ShadowLimitA += WorkGroup0 * MT0 * 2.
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2
+s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
+s_cmp_lt_i32 s[sgprWaveID], 8
+s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2]
+s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3]
+
+// 64-bit tile offset for the scale buffer: Temp1:Temp0 = WorkGroup0 * MT0 * 8.
+s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+// Fix: the x8 used to be done as s_mul_hi_u32(Temp1, 8) + s_mul_i32(Temp0, 8), which
+// computes the high word from the old high word instead of from the low word.
+// A 64-bit shift is exact; the result is unchanged whenever the offset fits in 32 bits.
+// SCC is not live across this point (the s_cmov pair above already consumed it).
+s_lshl_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 3 // tlu=0, scaled tile-offset by stride
+
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 // scale limit = ShadowLimitA / 16
+
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1] // fix: subtract the offset's high word too (was 0)
+
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1
+
+s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 // zero-point limit = ShadowLimitA / 64
+
+s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 // zero-point tile offset = scale tile offset / 4
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], s[sgprTemp1] // fix: subtract the offset's high word too (was 0)
+
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1
+s_mov_b32 s[sgprZero+3], Srd127_96
+
+/******************************************/
+/* Define Global Load... 
*/
+/******************************************/
+
+// GLOBAL_LOADAB offset: issue one round of buffer loads carrying the `lds` modifier:
+// two dwordx2 loads through SrdA and four dwordx4 loads through SrdB. m0 is set before
+// each load to a LocalWriteAddr* value plus \offset (B adds WAVE_LDS_OFFSET_B*4 for its second half).
+// NOTE(review): presumably m0 supplies the LDS destination for `lds` loads -- confirm for this target.
+.macro GLOBAL_LOADAB offset:req
+
+s_add_u32 m0, s[sgprLocalWriteAddrA], \offset
+buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset
+buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrB], \offset
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+s_add_u32 m0, m0, WAVE_LDS_OFFSET_B*4
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB+2:vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrB1], \offset
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB1:vgprGlobalReadOffsetB1+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+s_add_u32 m0, m0, WAVE_LDS_OFFSET_B*4
+buffer_load_dwordx4 v[vgprGlobalReadOffsetB1+2:vgprGlobalReadOffsetB1+3], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+
+.endm
+
+// GLOBAL_LOAD_Scale_Zero: load one zero-point byte into ValuZeros and one scale dword into ValuScales.
+.macro GLOBAL_LOAD_Scale_Zero
+buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0
+buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0
+.endm
+
+// GLOBAL_INC_Scale_Zero: advance the Scale base by GlobalReadIncsA >> 3 and the Zero base by
+// GlobalReadIncsA >> 5, and shrink each descriptor's word 2 by the same amount, clamping at 0.
+// Clobbers s[sgprTemp0], s[sgprTemp1], s[sgprTemp2] and SCC.
+.macro GLOBAL_INC_Scale_Zero
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0]
+s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1]
+s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] // candidate new limit = limit - increment
+s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // limit smaller than the increment?
+s_cmov_b32 s[sgprScale+2], 0 // yes: clamp to 0
+s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // still at least one increment left?
+s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // yes: take limit - increment
+
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0]
+s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1]
+s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] // candidate new limit = limit - increment
+s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // limit smaller than the increment?
+s_cmov_b32 s[sgprZero+2], 0 // yes: clamp to 0
+s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // still at least one increment left?
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] // yes: take limit - increment
+
+.endm
+
+
+// UnPackB32ToTwoF32 vgprScale, vgprOut: split one dword holding two f16 values into
+// two f32 registers: Out+0 = low half, Out+1 = high half.
+.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req
+v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale]
+v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0]
+v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0]
+v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// UnPackB8To2F32 vgprZero, vgprOut: split one byte into its two nibbles and convert each
+// to f32: Out+0 = low nibble, Out+1 = high nibble.
+.macro UnPackB8To2F32 vgprZero:req vgprOut:req
+v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero]
+v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf
+v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf
+
+v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// I4ToFp16 vgprIn, vgprZero, vgprScale, vgprPack: dequantize four bytes (one per VGPR at
+// vgprIn+0..3, two 4-bit values each) to f16 and pack them into vgprPack+0..3.
+//   value = nibble * scale + zero, with scale/zero element 0 applied to low nibbles and
+//   element 1 to high nibbles.
+//   Pack+0/+1 hold the low-nibble results, Pack+2/+3 the high-nibble results.
+// Always uses vgprValuA_X0_H0+0..7 as scratch, whatever vgprIn is.
+.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn]
+v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1]
+v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2]
+v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3]
+v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf
+
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+
+v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2]
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7]
+.endm
+
+/******************************************/
+/* Define Global Load address Increase... */
+/******************************************/
+
+// GLOBAL_INC: advance SrdA by GlobalReadIncsA (also shrinking ShadowLimitA and refreshing
+// SrdA+2 from it) and advance the SrdB base by GlobalReadIncsB. SrdB+2 is left untouched.
+// Clobbers s[sgprTemp0], s[sgprTemp1] and SCC.
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1]
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1]
+.endm
+
+/******************************************/
+/* Define LDS Load... */
+/******************************************/
+
+// LDS_LOADAB off: read the X0 operands from LDS at byte offset \off:
+// four single bytes of A via LocalReadAddrA+0..3 and eight 64-bit reads of B
+// via LocalReadAddrB / LocalReadAddrB+1.
+.macro LDS_LOADAB off:req
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256
+ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512
+ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0
+
+ds_read_b64 v[vgprValuB_X0_I0+ 8:vgprValuB_X0_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4096
+ds_read_b64 v[vgprValuB_X0_I0+ 10:vgprValuB_X0_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4352
+ds_read_b64 v[vgprValuB_X0_I0+ 12:vgprValuB_X0_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4608
+ds_read_b64 v[vgprValuB_X0_I0+ 14:vgprValuB_X0_I0+ 15], v[vgprLocalReadAddrB+1] offset:\off+4096
+.endm
+
+// LDS_LOADAB1 off: same as LDS_LOADAB for the X1 operands: A via LocalReadAddrA+4..7,
+// B at +32 bytes relative to the X0 reads, wrapped reads via LocalReadAddrB+2.
+.macro LDS_LOADAB1 off:req
+
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288
+ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544
+ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0
+
+ds_read_b64 v[vgprValuB_X1_I0+ 8:vgprValuB_X1_I0+ 9], v[vgprLocalReadAddrB] offset:\off+4128
+ds_read_b64 v[vgprValuB_X1_I0+ 10:vgprValuB_X1_I0+ 11], v[vgprLocalReadAddrB] offset:\off+4384
+ds_read_b64 v[vgprValuB_X1_I0+ 12:vgprValuB_X1_I0+ 13], v[vgprLocalReadAddrB] offset:\off+4640
+ds_read_b64 v[vgprValuB_X1_I0+ 14:vgprValuB_X1_I0+ 15], v[vgprLocalReadAddrB+2] offset:\off+4096
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ... */
+/******************************************/
+
+// Iteration counts: LoopCntCommon = (SizesSum >> 6) + (1 if SizesSum is not a multiple of 64),
+// LoopCounterL = LoopCntCommon - 1 (the last iteration is handled as the tail).
+//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6
+
+
+s_min_u32 s[sgprLoopCntCommon], 3, s[sgprLoopCounterL] // NOTE(review): overwritten by the s_add_u32 below before any use
+// Fix: the remainder mask was still 31 (matching the old shift of 5 above); with a shift
+// of 6 a size with SizesSum % 64 == 32 produced SCC = 0 and lost its partial last tile.
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 63 // SCC = (SizesSum % 64 != 0)
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc // +1 iteration for a partial tile
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+s_cmp_lt_i32 s[sgprWaveID], 8
+s_cbranch_scc1 SkipGL // waves 0..7 skip the global-load path below
+
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump 
inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB1+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+3] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + +.endif + + + + + +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 
s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + + + + +s_cmp_lt_i32 s[sgprTemp3], 1 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(6) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... 
*/
+/******************************************/
+
+// SrdD base = AddressD + 2 * (WorkGroup0 * 128 + WorkGroup1 * MT1 * StridesD + StridesC+1 * WorkGroup2)
+// NOTE(review): 128 is hard-coded where other offsets use MT0, and the WorkGroup2 term uses
+// StridesC+1 while the rest uses StridesD -- confirm both are intended.
+s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower)
+s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128
+s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] // Temp1 = WorkGroup1 * MT1 * StridesD (reused for the limit below)
+s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1]
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2]
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // element offset -> bytes (x2)
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2]
+s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3]
+// SrdD+2 = 2 * (StridesD * SizesFree+1 - WorkGroup1 * MT1 * StridesD)
+s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1]
+s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1]
+s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1
+s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD
+
+// GlobalWriteOffsetD = 2 * StridesD * ((WaveID >> 2) * 64 + (lane >> 4)) + (lane & 15) * 4 + (WaveID & 3) * 64
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0]
+s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x6
+v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1] // row index
+v_and_b32 v[vgprTemp2], 15, v[vgprTemp0]
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] // (lane & 15) * 4 bytes
+v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD]
+v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] // row offset in bytes (x2)
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2]
+s_and_b32 s[sgprTemp0], s[sgprWaveID], 3
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 // (WaveID & 3) * 32 elements -> bytes (x2)
+v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0]
+
+/******************************************/
+/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 +v_mov_b32 v[vgprValuC+16], 0x0 +v_mov_b32 v[vgprValuC+17], 0x0 +v_mov_b32 v[vgprValuC+18], 0x0 +v_mov_b32 v[vgprValuC+19], 0x0 +v_mov_b32 v[vgprValuC+20], 0x0 +v_mov_b32 v[vgprValuC+21], 0x0 +v_mov_b32 v[vgprValuC+22], 0x0 +v_mov_b32 v[vgprValuC+23], 0x0 +v_mov_b32 v[vgprValuC+24], 0x0 +v_mov_b32 v[vgprValuC+25], 0x0 +v_mov_b32 v[vgprValuC+26], 0x0 +v_mov_b32 v[vgprValuC+27], 0x0 +v_mov_b32 v[vgprValuC+28], 0x0 +v_mov_b32 v[vgprValuC+29], 0x0 +v_mov_b32 v[vgprValuC+30], 0x0 +v_mov_b32 v[vgprValuC+31], 0x0 +v_mov_b32 v[vgprValuC+32], 0x0 +v_mov_b32 v[vgprValuC+33], 0x0 +v_mov_b32 v[vgprValuC+34], 0x0 +v_mov_b32 v[vgprValuC+35], 0x0 +v_mov_b32 v[vgprValuC+36], 0x0 +v_mov_b32 v[vgprValuC+37], 0x0 +v_mov_b32 v[vgprValuC+38], 0x0 +v_mov_b32 v[vgprValuC+39], 0x0 +v_mov_b32 v[vgprValuC+40], 0x0 +v_mov_b32 v[vgprValuC+41], 0x0 +v_mov_b32 v[vgprValuC+42], 0x0 +v_mov_b32 v[vgprValuC+43], 0x0 +v_mov_b32 v[vgprValuC+44], 0x0 +v_mov_b32 v[vgprValuC+45], 0x0 +v_mov_b32 v[vgprValuC+46], 0x0 +v_mov_b32 v[vgprValuC+47], 0x0 +v_mov_b32 v[vgprValuC+48], 0x0 +v_mov_b32 v[vgprValuC+49], 0x0 +v_mov_b32 v[vgprValuC+50], 0x0 +v_mov_b32 v[vgprValuC+51], 0x0 +v_mov_b32 v[vgprValuC+52], 0x0 +v_mov_b32 v[vgprValuC+53], 0x0 +v_mov_b32 v[vgprValuC+54], 0x0 +v_mov_b32 v[vgprValuC+55], 0x0 +v_mov_b32 v[vgprValuC+56], 0x0 +v_mov_b32 v[vgprValuC+57], 0x0 +v_mov_b32 v[vgprValuC+58], 0x0 +v_mov_b32 v[vgprValuC+59], 0x0 +v_mov_b32 
v[vgprValuC+60], 0x0 +v_mov_b32 v[vgprValuC+61], 0x0 +v_mov_b32 v[vgprValuC+62], 0x0 +v_mov_b32 v[vgprValuC+63], 0x0 + +GLOBAL_INC_Scale_Zero +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 + +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... */ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], 
-v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 
+s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +WaveID_gecase: + + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW4_7: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 
vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[32] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[32] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 
v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[32] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + + + +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[32] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(12) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 
+ +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[32] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW4_7 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[32] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1] + +s_mul_i32 s[sgprTemp2], 8192, s[sgprWaveID] +v_add_u32 v[vgprLocalWriteC], s[sgprTemp2], v[vgprLocalWriteC] + +s_barrier + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3 +v_add_u32 v[vgprLocalReadC], 0x8000, v[vgprLocalWriteC] +Skip_Wave0_3: + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7 +s_mov_b32 s[sgprTemp2], 0x8000 +v_sub_u32 v[vgprLocalReadC], v[vgprLocalWriteC], s[sgprTemp2] +Skip_Wave4_7: + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3_W +ds_write_b32 v[vgprLocalWriteC], v[32], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[36], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[33], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[37], offset:768 +ds_write_b32 
v[vgprLocalWriteC], v[34], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[38], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[35], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[39], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[40], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[44], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[41], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[45], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[42], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[46], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[43], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[47], offset:3840 + +ds_write_b32 v[vgprLocalWriteC], v[48], offset:4096 +ds_write_b32 v[vgprLocalWriteC], v[52], offset:4352 +ds_write_b32 v[vgprLocalWriteC], v[49], offset:4608 +ds_write_b32 v[vgprLocalWriteC], v[53], offset:4864 +ds_write_b32 v[vgprLocalWriteC], v[50], offset:5120 +ds_write_b32 v[vgprLocalWriteC], v[54], offset:5376 +ds_write_b32 v[vgprLocalWriteC], v[51], offset:5632 +ds_write_b32 v[vgprLocalWriteC], v[55], offset:5888 + +ds_write_b32 v[vgprLocalWriteC], v[56], offset:6144 +ds_write_b32 v[vgprLocalWriteC], v[60], offset:6400 +ds_write_b32 v[vgprLocalWriteC], v[57], offset:6656 +ds_write_b32 v[vgprLocalWriteC], v[61], offset:6912 +ds_write_b32 v[vgprLocalWriteC], v[58], offset:7168 +ds_write_b32 v[vgprLocalWriteC], v[62], offset:7424 +ds_write_b32 v[vgprLocalWriteC], v[59], offset:7680 +ds_write_b32 v[vgprLocalWriteC], v[63], offset:7936 + +s_waitcnt lgkmcnt(0) +s_barrier +ds_read_b32 v[vgprTmpValC+32], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+36], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+33], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+37], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+34], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+38], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+35], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+39], 
v[vgprLocalReadC+0] offset:1792 +ds_read_b32 v[vgprTmpValC+40], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+44], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+41], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+45], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 v[vgprTmpValC+42], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+46], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+43], v[vgprLocalReadC+0] offset:3584 +ds_read_b32 v[vgprTmpValC+47], v[vgprLocalReadC+0] offset:3840 + + +ds_read_b32 v[vgprTmpValC+48], v[vgprLocalReadC+0] offset:4096 +ds_read_b32 v[vgprTmpValC+52], v[vgprLocalReadC+0] offset:4352 +ds_read_b32 v[vgprTmpValC+49], v[vgprLocalReadC+0] offset:4608 +ds_read_b32 v[vgprTmpValC+53], v[vgprLocalReadC+0] offset:4864 +ds_read_b32 v[vgprTmpValC+50], v[vgprLocalReadC+0] offset:5120 +ds_read_b32 v[vgprTmpValC+54], v[vgprLocalReadC+0] offset:5376 +ds_read_b32 v[vgprTmpValC+51], v[vgprLocalReadC+0] offset:5632 +ds_read_b32 v[vgprTmpValC+55], v[vgprLocalReadC+0] offset:5888 +ds_read_b32 v[vgprTmpValC+56], v[vgprLocalReadC+0] offset:6144 +ds_read_b32 v[vgprTmpValC+60], v[vgprLocalReadC+0] offset:6400 +ds_read_b32 v[vgprTmpValC+57], v[vgprLocalReadC+0] offset:6656 +ds_read_b32 v[vgprTmpValC+61], v[vgprLocalReadC+0] offset:6912 +ds_read_b32 v[vgprTmpValC+58], v[vgprLocalReadC+0] offset:7168 +ds_read_b32 v[vgprTmpValC+62], v[vgprLocalReadC+0] offset:7424 +ds_read_b32 v[vgprTmpValC+59], v[vgprLocalReadC+0] offset:7680 +ds_read_b32 v[vgprTmpValC+63], v[vgprLocalReadC+0] offset:7936 + +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+32] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+36] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+33] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+37] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+34] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+38] +v_add_f32 
v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+35] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+39] + +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+40] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+44] +v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+41] +v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+45] +v_add_f32 v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+42] +v_add_f32 v[vgprTmpValC+14], v[vgprTmpValC+14], v[vgprTmpValC+46] +v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+43] +v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+47] + +v_add_f32 v[vgprTmpValC+16], v[vgprTmpValC+16], v[vgprTmpValC+48] +v_add_f32 v[vgprTmpValC+20], v[vgprTmpValC+20], v[vgprTmpValC+52] +v_add_f32 v[vgprTmpValC+17], v[vgprTmpValC+17], v[vgprTmpValC+49] +v_add_f32 v[vgprTmpValC+21], v[vgprTmpValC+21], v[vgprTmpValC+53] +v_add_f32 v[vgprTmpValC+18], v[vgprTmpValC+18], v[vgprTmpValC+50] +v_add_f32 v[vgprTmpValC+22], v[vgprTmpValC+22], v[vgprTmpValC+54] +v_add_f32 v[vgprTmpValC+19], v[vgprTmpValC+19], v[vgprTmpValC+51] +v_add_f32 v[vgprTmpValC+23], v[vgprTmpValC+23], v[vgprTmpValC+55] + +v_add_f32 v[vgprTmpValC+24], v[vgprTmpValC+24], v[vgprTmpValC+56] +v_add_f32 v[vgprTmpValC+28], v[vgprTmpValC+28], v[vgprTmpValC+60] +v_add_f32 v[vgprTmpValC+25], v[vgprTmpValC+25], v[vgprTmpValC+57] +v_add_f32 v[vgprTmpValC+29], v[vgprTmpValC+29], v[vgprTmpValC+61] +v_add_f32 v[vgprTmpValC+26], v[vgprTmpValC+26], v[vgprTmpValC+58] +v_add_f32 v[vgprTmpValC+30], v[vgprTmpValC+30], v[vgprTmpValC+62] +v_add_f32 v[vgprTmpValC+27], v[vgprTmpValC+27], v[vgprTmpValC+59] +v_add_f32 v[vgprTmpValC+31], v[vgprTmpValC+31], v[vgprTmpValC+63] +Skip_Wave0_3_W: + + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7_W + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 
v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +ds_write_b32 v[vgprLocalWriteC], v[16], offset:4096 +ds_write_b32 v[vgprLocalWriteC], v[20], offset:4352 +ds_write_b32 v[vgprLocalWriteC], v[17], offset:4608 +ds_write_b32 v[vgprLocalWriteC], v[21], offset:4864 +ds_write_b32 v[vgprLocalWriteC], v[18], offset:5120 +ds_write_b32 v[vgprLocalWriteC], v[22], offset:5376 +ds_write_b32 v[vgprLocalWriteC], v[19], offset:5632 +ds_write_b32 v[vgprLocalWriteC], v[23], offset:5888 + +ds_write_b32 v[vgprLocalWriteC], v[24], offset:6144 +ds_write_b32 v[vgprLocalWriteC], v[28], offset:6400 +ds_write_b32 v[vgprLocalWriteC], v[25], offset:6656 +ds_write_b32 v[vgprLocalWriteC], v[29], offset:6912 +ds_write_b32 v[vgprLocalWriteC], v[26], offset:7168 +ds_write_b32 v[vgprLocalWriteC], v[30], offset:7424 +ds_write_b32 v[vgprLocalWriteC], v[27], offset:7680 +ds_write_b32 v[vgprLocalWriteC], v[31], offset:7936 + +s_waitcnt lgkmcnt(0) +s_barrier + + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+0] offset:1536 
+ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+0] offset:1792 +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 v[vgprTmpValC+10], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+0] offset:3584 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+0] offset:3840 + +ds_read_b32 v[vgprTmpValC+16], v[vgprLocalReadC+0] offset:4096 +ds_read_b32 v[vgprTmpValC+20], v[vgprLocalReadC+0] offset:4352 +ds_read_b32 v[vgprTmpValC+17], v[vgprLocalReadC+0] offset:4608 +ds_read_b32 v[vgprTmpValC+21], v[vgprLocalReadC+0] offset:4864 +ds_read_b32 v[vgprTmpValC+18], v[vgprLocalReadC+0] offset:5120 +ds_read_b32 v[vgprTmpValC+22], v[vgprLocalReadC+0] offset:5376 +ds_read_b32 v[vgprTmpValC+19], v[vgprLocalReadC+0] offset:5632 +ds_read_b32 v[vgprTmpValC+23], v[vgprLocalReadC+0] offset:5888 +ds_read_b32 v[vgprTmpValC+24], v[vgprLocalReadC+0] offset:6144 +ds_read_b32 v[vgprTmpValC+28], v[vgprLocalReadC+0] offset:6400 +ds_read_b32 v[vgprTmpValC+25], v[vgprLocalReadC+0] offset:6656 +ds_read_b32 v[vgprTmpValC+29], v[vgprLocalReadC+0] offset:6912 +ds_read_b32 v[vgprTmpValC+26], v[vgprLocalReadC+0] offset:7168 +ds_read_b32 v[vgprTmpValC+30], v[vgprLocalReadC+0] offset:7424 +ds_read_b32 v[vgprTmpValC+27], v[vgprLocalReadC+0] offset:7680 +ds_read_b32 v[vgprTmpValC+31], v[vgprLocalReadC+0] offset:7936 + +s_waitcnt lgkmcnt(0) + +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+32] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+36] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+33] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+37] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+34] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], 
v[vgprTmpValC+38] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+35] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+39] + +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+40] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+44] +v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+41] +v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+45] +v_add_f32 v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+42] +v_add_f32 v[vgprTmpValC+14], v[vgprTmpValC+14], v[vgprTmpValC+46] +v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+43] +v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+47] + +v_add_f32 v[vgprTmpValC+16], v[vgprTmpValC+16], v[vgprTmpValC+48] +v_add_f32 v[vgprTmpValC+20], v[vgprTmpValC+20], v[vgprTmpValC+52] +v_add_f32 v[vgprTmpValC+17], v[vgprTmpValC+17], v[vgprTmpValC+49] +v_add_f32 v[vgprTmpValC+21], v[vgprTmpValC+21], v[vgprTmpValC+53] +v_add_f32 v[vgprTmpValC+18], v[vgprTmpValC+18], v[vgprTmpValC+50] +v_add_f32 v[vgprTmpValC+22], v[vgprTmpValC+22], v[vgprTmpValC+54] +v_add_f32 v[vgprTmpValC+19], v[vgprTmpValC+19], v[vgprTmpValC+51] +v_add_f32 v[vgprTmpValC+23], v[vgprTmpValC+23], v[vgprTmpValC+55] + +v_add_f32 v[vgprTmpValC+24], v[vgprTmpValC+24], v[vgprTmpValC+56] +v_add_f32 v[vgprTmpValC+28], v[vgprTmpValC+28], v[vgprTmpValC+60] +v_add_f32 v[vgprTmpValC+25], v[vgprTmpValC+25], v[vgprTmpValC+57] +v_add_f32 v[vgprTmpValC+29], v[vgprTmpValC+29], v[vgprTmpValC+61] +v_add_f32 v[vgprTmpValC+26], v[vgprTmpValC+26], v[vgprTmpValC+58] +v_add_f32 v[vgprTmpValC+30], v[vgprTmpValC+30], v[vgprTmpValC+62] +v_add_f32 v[vgprTmpValC+27], v[vgprTmpValC+27], v[vgprTmpValC+59] +v_add_f32 v[vgprTmpValC+31], v[vgprTmpValC+31], v[vgprTmpValC+63] + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprLocalWriteC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + 
+v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + +Skip_Wave4_7_W: + + + + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword 
v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 
s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 
+s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] 
with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], 
v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], 
s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], 
s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] 
+s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 
s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, 
v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +Beta_EndSwitch: +s_endpgm diff --git 
a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_dequant.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_dequant.s new file mode 100644 index 0000000000000000000000000000000000000000..c29c1085ea1ecbbfd1eaab2673bedfeabdf734ca --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_dequant.s @@ -0,0 +1,1358 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32_dq +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32_dq +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32_dq,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32_dq + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32_dq + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32_dq.kd' + .language: OpenCL C + .language_version: 
+ - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 
+ .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32_dq: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprGlobalWriteD_Edge, 230 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 212 +.set vgprGlobalReadOffsetB, 214 +.set vgprGlobalReadOffsetB1, 218 
+.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 // NOTE(review): A0/A1 are 0/1 and B0..B2 are 32..34 - confirm 3 (not 2) is intended +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +// Args define: SGPR aliases for the kernarg pointer, workgroup ids and loaded kernel arguments +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address + + +// user-defined SGPR aliases +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // NOTE(review): same index as sgprAddressC (30) - confirm the overlap is intended +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprLDSMask, 75 // NOTE(review): re-.set to 89 further down in this table +//.set sgprLoopforPfIter, 76 +//.set sgprLDSWriteIter, 78 + +.set sgprLocalWriteAddrA1, 76 +.set sgprLocalWriteAddrB1, 77 +.set sgprLocalWriteAddrA1ori, 78 +.set sgprLocalWriteAddrB1ori, 79 + + + +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 // second definition of sgprLDSMask (75 above) + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set
sgprStrideStruct, 96 +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] 
v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 32 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 4096 +.set LDS_BLK_OFFSET_64Kmasked, 4096 +.set LOG2BPE, 1 +//.set BPE, 2 +.set BPE, 1 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 
s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +//s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x4000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 
+v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], 
v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? 
+s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved in sgprWorkGroup0Ori +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256 // this is the total thread count; adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 +v_add_u32 v7, v6, v[vgprSerial] // v7 = v6 + serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = v7 * 0x40 +v_mov_b32 v2, 0 // v2 = 0 (addend for the carry add below) +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = upper dword of the debug buffer address +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // lower dword: debug buffer address + v8 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // upper dword: v3 + v2 + carry +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWaveID] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprStridesA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +//s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +//s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mov_b32 s[sgprTemp1], s[sgprStridesA] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + +//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice +s_lshr_b32 s[sgprTemp1], s[sgprTemp1], 1 +v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + + + + + + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 
s[sgprTemp0], s[sgprWorkGroup0], 2*MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +//s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG + +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 + +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_and_b32 s[sgprTemp0], s[sgprStridesA], 15 +s_sub_u32 s[sgprTemp1], 16, s[sgprTemp0] +s_add_u32 s[sgprTemp1], s[sgprShadowLimitA+0], s[sgprTemp1] +s_cmp_gt_u32 s[sgprTemp0], 0 +s_cmov_b32 s[sgprShadowLimitA+0], s[sgprTemp1] + +//s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +//s_cbranch_scc1 label_SkipMmac + +//s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate LDS A parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 // (GlWaveID & 1) * 4 moved to bits 16 and up; NOTE(review): meaning of these bits is not visible here +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] // keep a copy of the initial value + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] // second write address = first + 0x8000 + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // serial % 64 +v_and_b32 v0, 15, v[vgprTemp0] // v0 = (serial % 64) % 16 +v_lshrrev_b32 v1, 4, v[vgprTemp0] // v1 = (serial % 64) / 16 +v_mul_u32_u24 v2, 0x40, v1 // v2 = v1 * 0x40 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] // += (WaveID & 3) * LDS_SUB_M_OFFSET + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 // s[sgprTemp2] = (WaveID >> 2) * 0x8000; also used for the ADDR_WRAP limits below +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//ADDR_WRAP inputs: vaddr = address register, updated in place; v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +// if vaddr >= v[vgprTemp1] then vaddr -= v[vgprTemp2]; clobbers s[sgprTemp0:sgprTemp1] and v[vgprTemp3] +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] // lane mask: vaddr >= max clip +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] // reducer where the mask is set, else 0 +v_sub_u32
\vaddr, \vaddr, v[vgprTemp3] +.endm +v_mov_b32 v[vgprTemp2], 0x200 // reducer: amount ADDR_WRAP subtracts +v_mov_b32 v[vgprTemp1], 0x400 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // max clip = 0x400 + s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // max clip = 0x800 + s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load; NOTE(review): exact duplicate of the previous line - confirm whether WAVE_LDS_OFFSET_A was intended + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // s[sgprTemp2] = SizesSum0 >> 6 +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 1 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 7 // s[sgprTemp2] = SizesSum0 >> 7 +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cmov_b32
s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], 8*MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8*MT0 // WorkGroup[01] * MT + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprScale+2], 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprZero+2], 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // are we within 2^32? 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] + +.endm + +// UnPackB32ToTwoF32: split the dword in v[vgprScale] into its two 16-bit halves and convert each from f16 to f32 (vgprOut+0 <- bits 15:0, vgprOut+1 <- bits 31:16). +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1] +.endm + +// UnPackB8To2F32: split the low byte of v[vgprZero] into two 4-bit values and convert each to f32 (vgprOut+0 <- low nibble, vgprOut+1 <- high nibble). +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +// I4ToFp16: unpack the four dwords vgprIn+0..3 into eight 4-bit values in vgprValuA_X0_H0+0..7 (low nibble -> even slot, high nibble -> odd slot), convert them to f32, apply value*scale+zero with v_pk_fma_f32, then convert to f16 into vgprPack+0..7. +// Overwrites vgprValuA_X0_H0+0..7. The high nibble is shifted but not masked and v_cvt_f32_ubyte0 reads byte 0 only, so bits 8..11 of each input dword must be zero. NOTE(review): callers appear to fill the inputs with ds_read_u8 -- confirm. +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], 
v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +v_cvt_f16_f32 v[\vgprPack+0], v[vgprValuA_X0_H0+0] +v_cvt_f16_f32 v[\vgprPack+1], v[vgprValuA_X0_H0+1] +v_cvt_f16_f32 v[\vgprPack+2], v[vgprValuA_X0_H0+2] +v_cvt_f16_f32 v[\vgprPack+3], v[vgprValuA_X0_H0+3] + +v_cvt_f16_f32 v[\vgprPack+4], v[vgprValuA_X0_H0+4] +v_cvt_f16_f32 v[\vgprPack+5], v[vgprValuA_X0_H0+5] +v_cvt_f16_f32 v[\vgprPack+6], v[vgprValuA_X0_H0+6] +v_cvt_f16_f32 v[\vgprPack+7], v[vgprValuA_X0_H0+7] +.endm + + + +.macro WriteFp16ToGlobal vgprOut:req +buffer_store_short v[\vgprOut+0], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+1], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 + +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +buffer_store_short v[\vgprOut+2], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+3], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 + +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +buffer_store_short v[\vgprOut+4], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+5], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 + +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + 
+buffer_store_short v[\vgprOut+6], v[vgprGlobalWriteOffsetD+0], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 +buffer_store_short v[\vgprOut+7], v[vgprGlobalWriteOffsetD+1], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 + +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*13 // final step: together with the three 2*1 steps earlier in this macro, one call adds 2*(1+1+1+13) = 2*16 times StridesD to the SRD base +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] // NOTE(review): s_subb_u32 also subtracts the SCC left by the s_addc_u32 above -- confirm s_sub_u32 was not intended +s_cmov_b32 s[sgprSrdD+2], 0 // on borrow, clamp SRD word 2 to 0 + +.endm + + + + +/******************************************/ +/* Define Global Load address Increase... */ +/******************************************/ + +// GLOBAL_INC: add s[sgprGlobalReadIncsA] to the 64-bit base in SrdA+0/+1, subtract it from the 64-bit ShadowLimitA, then reload SrdA+2 from the shadow limit (BufferLimit is used instead when the upper limit dword is non-zero). Overwrites sgprTemp0/sgprTemp1 and SCC. +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +.endm + +/******************************************/ +/* Define LDS Load... */ +/******************************************/ + +// LDS_LOADAB: four ds_read_u8 loads from the addresses in vgprLocalReadAddrA+0..3 (plus immediate offset off) into vgprValuA_X0_I0+0..3. +.macro LDS_LOADAB off:req + +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off +.endm + +// LDS_LOADAB1: same as LDS_LOADAB but reads through vgprLocalReadAddrA+4..7 into vgprValuA_X1_I0+0..3. +.macro LDS_LOADAB1 off:req + +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // LoopCounterL = SizesSum >> 6 + + +s_min_u32 s[sgprLoopCntCommon], 4, s[sgprLoopCounterL] // NOTE(review): this result is overwritten by the s_add_u32 two instructions below and its SCC by the s_and_b32 -- looks like dead code +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 // SCC = (SizesSum & 31) != 0. NOTE(review): mask 31 matches the commented-out ">> 5" above, not the active ">> 6" -- confirm 63 is not required +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc // one extra iteration when the masked remainder is non-zero +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cbranch_scc1 SkipGL // waves with WaveID < 8 skip the global-load path below + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + +.endif + + + +s_cmp_eq_i32 
s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +s_cmp_lt_i32 s[sgprTemp3], 2 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(4) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_waitcnt vmcnt(2) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +//s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] + +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesSum] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2 +//s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x4 + +s_lshr_b32 s[sgprTemp1], s[sgprSizesSum], 0x1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +v_add_u32 v[vgprGlobalWriteOffsetD+1], 
v[vgprGlobalWriteOffsetD], 2 + +/******************************************/ +/* Init ValueC ... */ +/******************************************/ +GLOBAL_LOAD_Scale_Zero + +GLOBAL_INC_Scale_Zero + + +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 + +v_and_b32 v[vgprGlobalWriteD_Edge+2], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalWriteD_Edge+2], 1, v[vgprGlobalWriteD_Edge+2] +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprGlobalWriteD_Edge+2], v[vgprGlobalWriteD_Edge+2], s[sgprTemp1] +v_lshlrev_b32 v[vgprGlobalWriteD_Edge+2], 1, v[vgprGlobalWriteD_Edge+2] +//v_mov_b32 v[vgprGlobalWriteD_Edge+3], v[vgprGlobalWriteD_Edge+2] //store inittial addr +//v_mov_b32 v[vgprGlobalWriteD_Edge+0], v[vgprGlobalWriteOffsetD] //store inittial addr + +//v_mov_b32 v[vgprGlobalWriteD_Edge+1], v[vgprGlobalWriteD_Edge+0] +//v_mov_b32 v[vgprGlobalWriteD_Edge+2], v[vgprGlobalWriteD_Edge+3] +//v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprGlobalWriteD_Edge+2], s[sgprD_MEdge] +v_cmp_gt_u32 s[sgprTemp2:sgprTemp2+1], v[vgprGlobalWriteD_Edge+2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], -1, s[sgprTemp2:sgprTemp2+1] +v_cndmask_b32 v[vgprGlobalWriteOffsetD+1], v[vgprGlobalWriteOffsetD+1], -1, s[sgprTemp2:sgprTemp2+1] + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... 
*/ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 + +.if debug_buffer + +v_mov_b32 v[vgprDebugTmp], v[vgprValuScales+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuZeros+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + + + +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... */ +/******************************************/ + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + + + +.if debug_buffer + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + +//v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrA+0] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +//v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrA+1] +//flat_store_dword 
v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+3] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +//v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD+0] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +//v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD+1] +//flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +//v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + + + +.endif + + + + + +WriteFp16ToGlobal vgprValuA_X2_I0 + +//s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 + +//s_waitcnt vmcnt(0) +WriteFp16ToGlobal vgprValuA_X3_I0 + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 
vgprValuScalesF32+0 vgprValuA_X2_I0 +WriteFp16ToGlobal vgprValuA_X2_I0 +s_waitcnt vmcnt(8) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +WriteFp16ToGlobal vgprValuA_X2_I0 +//s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +WriteFp16ToGlobal vgprValuA_X2_I0 +s_waitcnt vmcnt(8) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 
MainLoopBeginW0_3 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdD+3] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + + + +WriteFp16ToGlobal vgprValuA_X2_I0 +s_waitcnt vmcnt(0) +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 
vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +WriteFp16ToGlobal vgprValuA_X3_I0 + + +s_endpgm \ No newline at end of file diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..741bf2d6395ebea79c2cfe8f36d6c79c2913afb7 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,1688 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 24576 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ 
+.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 
4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 24576 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 +.set vgprValuB_X0_I0, 244 +.set vgprValuB_X1_I0, 248 +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set 
sgprStridesC, 38
+.set sgprStridesA, 40
+.set sgprStridesB, 42
+.set sgprAlpha, 44
+.set sgprBeta, 45
+.set sgprWGMBuffer, 46
+.set sgprAddressDbg, 48 // debug buffer address
+
+
+//user define
+// NOTE(review): several symbols in this section reuse an SGPR index that
+// another symbol already names. Each case is flagged on its own line; confirm
+// that the live ranges of the aliased values never overlap.
+.set sgprWorkGroup0Ori, 50
+.set sgprLoopCntCommon, 51
+.set sgprSrdA, 52
+.set sgprSrdB, 56
+.set sgprShadowLimitA, 60
+.set sgprShadowLimitB, 30 // same index as sgprAddressC (30)
+.set sgprGlobalReadIncsA, 62
+.set sgprGlobalReadIncsB, 63
+
+.set sgprLocalWriteAddrA, 72
+.set sgprLocalWriteAddrB, 73
+//.set sgprWaveID, 74
+.set sgprWaveID, 64 // overridden further down by ".set sgprWaveID, 88"
+.set sgprLDSMask, 75 // overridden further down by ".set sgprLDSMask, 89"
+.set sgprLoopforPfIter, 76
+
+.set sgprLDSWriteIter, 78
+.set sgprTemp0, 80
+.set sgprTemp1, 81
+.set sgprTemp2, 82
+.set sgprTemp3, 83
+.set sgprTensor2dSizeA, 84
+.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA
+.set sgprWaveID, 88 // value in effect for all code that follows
+.set sgprLDSMask, 89 // value in effect for all code that follows
+
+.set sgprLocalWriteAddrAori, 90
+.set sgprLocalWriteAddrBori, 91
+.set sgprGlWaveID, 92
+.set sgprD_MEdge, 93
+.set sgprTemp4, 96
+.set sgprTemp5, 97
+.set sgprTemp6, 98
+.set sgprTemp7, 99
+.set sgprStrideStruct, 96 // same index as sgprTemp4
+.set sgprStructBit, 97 // same index as sgprTemp5
+.set sgprStructNum, 98 // same index as sgprTemp6
+
+.set sgprLocalWriteAddrScale, 74
+.set sgprLocalWriteAddrZero, 75
+.set sgprZeroAddress, 68
+.set sgprScaleAddress, 70
+.set sgprZero, 64 // used as a 4-dword descriptor s[sgprZero:sgprZero+3]
+.set sgprScale, 20 // same index as sgprSrdC (20); used as s[sgprScale:sgprScale+3]
+
+
+
+// MMAC_32x32_0: four v_mmac_f32_16x16x16_f16 issues accumulating in place into
+// vgprValuC+0..15. Accumulator quad i (i = 0..3) takes A operand pair (i & 1)
+// of vgprValuA_X2_I0 and B operand pair (i >> 1) of vgprValuB_X0_I0.
+// s_setprio is raised to 1 after the first issue and restored to 0 at the end.
+.macro MMAC_32x32_0
+v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] //
+s_setprio 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] //
+v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] //
+v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] //
+s_setprio 0
+.endm
+
+// MMAC_32x32_1: same issue pattern as MMAC_32x32_0, but reading the A operands
+// from vgprValuA_X3_I0 and the B operands from vgprValuB_X1_I0.
+.macro MMAC_32x32_1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] //
+s_setprio 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] //
+v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] //
+v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] //
+s_setprio 0
+.endm
+
+// Tile-shape and LDS-layout constants used by the address setup that follows.
+.set MT0, 32
+.set MT1, 32
+
+.set LDS_B_OFFSET, 2048
+.set LDS_BLK_OFFSET, 4096
+.set LDS_BLK_OFFSET_64Kmasked, 4096
+.set LOG2BPE, 1
+.set BPE, 2
+.set DEPTHU, 32
+.set LOG2DEPTHU, 5
+.set PFTLOOPS, 6
+.set GLWAVES, 4
+.set LOG2GLWAVES, 2
+.set MperWAVE, 32
+.set NperWAVE, 32
+
+/* Load num of Gemms */
+s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0
+
+/* Load GSU data */
+s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4
+
+/* Load WGM data */
+s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8
+
+s_add_u32 
s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x6000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) 
+v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] 
+v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? 
+s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0
+s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ?
+s_cselect_b32 s78, s79, s[sgprWGM] // s78 = width of this block (remainder for the last block, else WGM)
+// Float-reciprocal division of s77 by s78 with a +1 correction when the
+// first-guess remainder equals the divisor.
+v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78
+v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78
+v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78
+v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78
+v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78
+v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78
+v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78
+v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78
+v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78
+v_mov_b32 v7, 0 // placeholder remainder; WorkGroup1 is recomputed by the s_mul/s_sub below
+s_mov_b64 exec, -1 // restore full exec mask after the v_cmpx
+v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient
+v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder
+s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor
+s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder
+s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM
+s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM
+label_WGM:
+
+
+// debug_buffer = 0 assembles none of the .if debug_buffer sections.
+.set debug_buffer, 0
+.if debug_buffer // compute the per-workgroup, per-thread offset into the debug buffer
+v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = original (1D) workgroup index
+v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; with the *2 below this is the thread count per workgroup, adjust it to the number of waves
+v_mul_lo_u32 v6, 2, v5 // v6 = v4 * 512
+v_add_u32 v7, v6, v[vgprSerial] // v7 = tid + NT*wg = serial
+v_mul_lo_u32 v8, 0x40, v7 // v8 = serial * 0x40 (byte offset)
+v_mov_b32 v2, 0 //
+v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high dword of the debug buffer address
+v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg] = AddressDbg + serial*0x40 (low dword)
+v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // high dword plus carry
+.endif
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprSerial]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], 
v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 5, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 31, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 7, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 
+s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/
+/* Generate LDS A parameters ... */
+/******************************************/
+.set WAVE_LDS_OFFSET_A, 64*8 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load
+//.set LDS_SUB_M_OFFSET, BPE*32
+.set LDS_SUB_M_OFFSET, 16
+.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES
+// NOTE(review): LOADxWAVES_K_A evaluates to 64/8*4 = 32, whose log2 is 5, and
+// the B-side counterpart uses 5. Confirm that 4 is intended here.
+.set LOADxWAVES_K_A_LOG2, 4
+.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES
+
+//Wrap Lds
+// LocalWriteAddrA = GlWaveID * WAVE_LDS_OFFSET_A, OR'ed with
+// ((GlWaveID & 1) * 4) << 16. The meaning of the upper 16 bits is not visible
+// here; presumably they are consumed through m0 by the "lds" buffer loads. TODO confirm.
+s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad
+s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16
+s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0]
+s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] // keep the initial value for ring wrap-around
+
+//get lds read addrA
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_and_b32 v0, 15, v[vgprTemp0] // v0 = lane % 16
+v_lshrrev_b32 v1, 4, v[vgprTemp0] // v1 = lane / 16
+/*
+v_and_b32 v3, 1, v1
+v_mul_u32_u24 v3, 0x40, v3
+v_mul_u32_u24 v2, WAVE_LDS_OFFSET_A+0, v1
+v_add_u32 v2, v3, v2
+*/
+v_mul_u32_u24 v2, 0x40, v1 // v2 = (lane / 16) * 0x40
+
+v_add_u32 v[vgprLocalReadAddrA], v2, v0
+s_mul_i32 s[sgprTemp0], s[sgprWaveID], LDS_SUB_M_OFFSET
+v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] // + WaveID * LDS_SUB_M_OFFSET
+
+// Seven further read addresses at fixed byte offsets from the base address.
+v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA]
+
+v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA]
+v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA]
+
+
+// ADDR_WRAP vaddr: if \vaddr >= v[vgprTemp1], subtract v[vgprTemp2] from \vaddr.
+// Fixed inputs: v[vgprTemp1] -> clip threshold; v[vgprTemp2] -> amount to subtract.
+// Clobbers s[sgprTemp0:sgprTemp1] (compare mask) and v[vgprTemp3].
+.macro ADDR_WRAP vaddr:req
+v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1]
+v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1]
+v_sub_u32 \vaddr, \vaddr, v[vgprTemp3]
+.endm
+// Wrap two of the A read addresses: subtract 0x200 when the address reaches
+// the threshold held in v[vgprTemp1].
+v_mov_b32 v[vgprTemp2], 0x200
+v_mov_b32 v[vgprTemp1], 0x400
+ADDR_WRAP v[vgprLocalReadAddrA+5]
+v_mov_b32 v[vgprTemp1], 0x800
+ADDR_WRAP v[vgprLocalReadAddrA+7]
+
+.set WAVE_LDS_OFFSET, UNDEF //x4 load
+
+/******************************************/
+/* Generate LDS B parameters ... */
+/******************************************/
+.set WAVE_LDS_OFFSET_B, 64*8 //x4 load
+.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load
+.set LDS_SUB_N_OFFSET, BPE*32
+.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES
+.set LOADxWAVES_K_B_LOG2, 5
+.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES
+
+//Wrap Lds
+// LocalWriteAddrB = (GlWaveID * WAVE_LDS_OFFSET_B | (GlWaveID & 3) << 16) + LDS_B_OFFSET
+s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad
+s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3
+//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2
+s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16
+s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0]
+s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET
+s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep the initial value for ring wrap-around
+
+// LocalReadAddrB = (lane % 4) * 64 + ((lane % 16) / 4) * 0x210 + (lane / 16) * 8 + LDS_B_OFFSET
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane id within the wave
+v_and_b32 v[vgprTemp1], v[vgprTemp0], 3
+v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0]
+v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1]
+// Was v[sgprTemp2]: an SGPR-index symbol (82) used as a VGPR index, which wrote
+// the term into v82 (between vgprValuA_X0_H0 = 64 and vgprValuA_X1_H0 = 96).
+// Shift in place in the VGPR temp instead; the final address is unchanged.
+v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2]
+v_and_b32 v[vgprTemp0], v[vgprTemp0], 15
+v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0]
+s_mov_b32 s[sgprTemp1], 0x210 // L1477
+v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0]
+v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0]
+v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2]
+s_mov_b32 s[sgprTemp1], LDS_B_OFFSET
+v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1]
+v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] // both symbols are index 228, so this is a self-move
+v_add_u32 v[vgprLocalReadAddrB+1], 0x100, v[vgprLocalReadAddrB]
+v_add_u32 v[vgprLocalReadAddrB+2], 0x120, v[vgprLocalReadAddrB]
+
+v_and_b32 v0, 63, v[vgprSerial] //
+v_and_b32 v1, 15, v0 //
+v_lshrrev_b32 v2, 2, v1 //
+v_lshlrev_b32 v3, 9, 
v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + +/******************************************/ +/* Keep Sgpr Values for use later ... */ +/******************************************/ + +//store sgprs to keep value +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2 + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], 
s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT
+// Scale the tile offset by 8. The high dword of the product must come from the
+// LOW dword (sgprTemp0), read before sgprTemp0 is overwritten. The previous
+// code multiplied sgprTemp1 by 8 and took the high half of that instead.
+s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], 8 // hi32(tileOffset * 8)
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // lo32(tileOffset * 8)
+
+// Scale descriptor: num_records = (ShadowLimitA >> 4) - offset, base = ScaleAddress + offset.
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4
+
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 // only the borrow is propagated; the offset's high dword is not subtracted
+
+
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+
+s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1
+
+s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD
+
+// Zero-point descriptor: same construction with the limit shifted by 6 and the
+// offset divided by 4.
+s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6
+
+s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2
+s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0]
+s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 // only the borrow is propagated; the offset's high dword is not subtracted
+
+s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32?
+s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32
+s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1
+s_mov_b32 s[sgprZero+3], Srd127_96 // Set bits 127_96 in SRD
+
+/******************************************/
+/* Define Global Load... 
*/
+/******************************************/
+
+// GLOBAL_LOADAB offset: issue one A and one B buffer_load_dwordx2 with the
+// "lds" modifier. Before each load, m0 is set to the matching local write
+// address plus \offset. Two vector-memory loads are issued per invocation.
+.macro GLOBAL_LOADAB offset:req
+
+s_add_u32 m0, s[sgprLocalWriteAddrA], \offset
+buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds
+
+s_add_u32 m0, s[sgprLocalWriteAddrB], \offset
+buffer_load_dwordx2 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds
+
+.endm
+
+// GLOBAL_LOAD_Scale_Zero: load one byte through the sgprZero descriptor into
+// vgprValuZeros and one dword through the sgprScale descriptor into vgprValuScales.
+.macro GLOBAL_LOAD_Scale_Zero
+buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0
+buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0
+.endm
+
+// GLOBAL_INC_Scale_Zero: advance the Scale descriptor base by
+// GlobalReadIncsA >> 3 and the Zero descriptor base by GlobalReadIncsA >> 5.
+// Dword 2 of each descriptor is reduced by the same step and clamped at 0.
+// Clobbers sgprTemp0, sgprTemp1, sgprTemp2 and SCC.
+.macro GLOBAL_INC_Scale_Zero
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0]
+s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1]
+s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0]
+s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // remaining < step ?
+s_cmov_b32 s[sgprScale+2], 0 // yes: clamp to 0
+s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // remaining >= step ?
+s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // yes: remaining -= step
+
+s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0]
+s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1]
+s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0]
+s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // remaining < step ?
+s_cmov_b32 s[sgprZero+2], 0 // yes: clamp to 0
+s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // remaining >= step ?
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] // yes: remaining -= step
+
+.endm
+
+// UnPackB32ToTwoF32 vgprScale, vgprOut: split the dword in v[\vgprScale] into
+// its low and high 16-bit halves and convert each from f16 to f32.
+// v[\vgprOut+0] receives the low half, v[\vgprOut+1] the high half.
+.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req
+v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale]
+v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0]
+v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0]
+v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// UnPackB8To2F32 vgprZero, vgprOut: split the low byte of v[\vgprZero] into two
+// 4-bit values and convert each to f32. v[\vgprOut+0] receives the low nibble,
+// v[\vgprOut+1] the high nibble.
+.macro UnPackB8To2F32 vgprZero:req vgprOut:req
+v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero]
+v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf
+v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf
+v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+
+// I4ToFp16 vgprIn, vgprZero, vgprScale, vgprPack: expand the packed 4-bit
+// pairs in v[\vgprIn+0..3] to f16.
+// Each value x becomes fma(x, scale, zero): low nibbles use element 0 of the
+// \vgprScale / \vgprZero pairs, high nibbles use element 1.
+// Output: v[\vgprPack+0..1] hold the four low-nibble results (two f16 per
+// dword), v[\vgprPack+2..3] hold the four high-nibble results.
+// Scratch: clobbers v[vgprValuA_X0_H0+0 .. vgprValuA_X0_H0+7].
+// NOTE(review): the high nibble is taken as byte 0 of (in >> 4) with no mask,
+// so this assumes each input dword is a zero-extended byte, as ds_read_u8 in
+// LDS_LOADAB produces. Confirm for any other caller.
+.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn]
+v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1]
+v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2]
+v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3]
+v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf
+
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+
+v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2]
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7]
+.endm
+
+/******************************************/
+/* Define Global Load address Increase... */
+/******************************************/
+
+// GLOBAL_INC: advance both global-read descriptors by one step.
+// A: base += GlobalReadIncsA, ShadowLimitA -= GlobalReadIncsA, then dword 2 of
+// SrdA is reloaded from the shadow limit (BufferLimit once the limit is >= 2^32).
+// B: base += GlobalReadIncsB only; dword 2 of SrdB is left unchanged because
+// the limit update is commented out.
+// Clobbers sgprTemp0, sgprTemp1 and SCC.
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1]
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1]
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1]
+//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0]
+//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1]
+//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
+
+
+.endm
+
+/******************************************/
+/* Define LDS Load... */
+/******************************************/
+
+// LDS_LOADAB off: read set 0 from LDS. Four single bytes of A go into
+// vgprValuA_X0_I0+0..3, one per read address vgprLocalReadAddrA+0..3. Two
+// 64-bit B reads go into vgprValuB_X0_I0+0..3 from vgprLocalReadAddrB and
+// vgprLocalReadAddrB+1. \off is added to every access as the instruction offset.
+.macro LDS_LOADAB off:req
+
+//ds_read_m32x16_b16 v[vgprValuA_X0_I0+ 0:vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+0] offset:\off
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB+1] offset:\off+0
+.endm
+
+// LDS_LOADAB1 off: read set 1 from LDS. A bytes come from
+// vgprLocalReadAddrA+4..7 into vgprValuA_X1_I0+0..3. B comes from
+// vgprLocalReadAddrB at \off+32 and from vgprLocalReadAddrB+2 at \off, into
+// vgprValuB_X1_I0+0..3.
+.macro LDS_LOADAB1 off:req
+
+//ds_read_m32x16_b16 v[vgprValuA_X1_I0+ 0:vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB+2] offset:\off+0
+.endm
+
+/******************************************/ +/* Use Global Load Wave process ... */ +/******************************************/ +/* LoopCounterL = SizesSum >> 5; LoopCntCommon = LoopCounterL plus SCC, where SCC comes from the s_and_b32 (set when SizesSum & 31 is non-zero, per the GCN scalar ALU rules); LoopCounterL = LoopCntCommon - 1. NOTE(review): the s_min_u32 result in LoopCntCommon is overwritten by the following s_add_u32 without being read -- looks dead; confirm. */ +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail +/* Waves with WaveID < 4 jump to SkipGL; the remaining waves issue the global loads in this section and finish at the s_endpgm before SkipGL. */ +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cbranch_scc1 SkipGL +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC +/* Advance the A LDS write address by one block and reset it to the saved original once it reaches the original address plus LDSMask; the B address below is handled the same way. */ +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +/* Temp3 counts completed iterations; the vmcnt(8) wait and barrier are skipped while Temp3 < 4. After the loop, Temp3 selects how many of the Last3/Last2/Last1 wait-and-barrier steps are executed (they fall through in order). */ +s_cmp_lt_i32 s[sgprTemp3], 4 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(8) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_cmp_eq_i32 s[sgprTemp3], 2 +s_cbranch_scc1 Last2 +s_cmp_eq_i32 s[sgprTemp3], 3 +s_cbranch_scc1 Last3 +s_waitcnt vmcnt(6) +s_barrier +Last3: +s_waitcnt vmcnt(4) +s_barrier +Last2: +s_waitcnt vmcnt(2) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier +/* Final load: both LDS write addresses are reset to their saved originals before the last GLOBAL_LOADAB, then this wave ends. */ +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ...
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 + +GLOBAL_INC_Scale_Zero +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X1_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X1_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrA+4] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + + + +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store 
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 
+s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], 
-v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 +/* NOTE(review): execution falls through to here only when LoopCntCommon <= 0, so the branch to TAIL_LOOP below is always taken and the s_branch after it looks unreachable -- confirm. */ + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP +s_branch WaveID_EndSwitch +WaveID_gecase: +/* WaveID_gecase has no body: it falls straight through WaveID_EndSwitch into TAIL_LOOP. */ +WaveID_EndSwitch: + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x32_1 + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: +/* NOTE(review): the branch target Beta_eqcase is the very next label, so both outcomes of the Beta == 0 test continue at the same instruction; Beta is not otherwise used in the visible store sequence. */ +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_mov_b32 s[sgprTemp1], s[sgprWaveID] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store initial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store initial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2],
s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], 
s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 
s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with 
strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], 
s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], 
v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s new file mode 100644 index 0000000000000000000000000000000000000000..fef8034b19e593986df61e7cea91bcd9ffb7f106 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s @@ -0,0 +1,1990 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- 
+custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + 
.value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x32x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 212 +.set vgprGlobalReadOffsetB, 214 +.set vgprGlobalReadOffsetB1, 218 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 
+.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // debug buffer address (loaded as a 2-SGPR pair) + + +// user-defined SGPR symbols +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // same index as sgprAddressC +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprLDSMask, 75 // NOTE(review): defined again as 89 further down; 75 is also sgprLocalWriteAddrZero +//.set sgprLoopforPfIter, 76 +//.set sgprLDSWriteIter, 78 + +.set sgprLocalWriteAddrA1, 76 +.set sgprLocalWriteAddrB1, 77 +.set sgprLocalWriteAddrA1ori, 78 +.set sgprLocalWriteAddrB1ori, 79 + + + +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 // second definition of sgprLDSMask + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // 96..98 reuse the indices of sgprTemp4..sgprTemp6 +.set sgprStructBit, 97 +.set sgprStructNum, 98 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // same index as sgprSrdC + + +// MMAC_32x64_0: four v_mmac_f32_16x16x16_f16 ops writing C quads 0..3 from A_X2 pairs 0/1 and B_X0 pairs 0/1; wave priority is raised to 1 after the first op and restored to 0 at the end. +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // C quad 0: A_X2 pair 0, B_X0 pair 0; the destination quad is also the 4th (presumably addend) operand +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // C quad 1: A_X2 pair 1, B_X0 pair 0
+v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // C quad 2: A_X2 pair 0, B_X0 pair 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // C quad 3: A_X2 pair 1, B_X0 pair 1 +s_setprio 0 +.endm +// MMAC_32x64_1: same four-op pattern as MMAC_32x64_0 over the same C quads, reading A_X3 pairs 0/1 and B_X1 pairs 0/1 instead. +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // C quad 0: A_X3 pair 0, B_X1 pair 0 +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // C quad 1: A_X3 pair 1, B_X1 pair 0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // C quad 2: A_X3 pair 0, B_X1 pair 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // C quad 3: A_X3 pair 1, B_X1 pair 1 +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 32 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 4096 +.set LDS_BLK_OFFSET_64Kmasked, 4096 +.set LOG2BPE, 1 // log2(BPE) +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 // log2(DEPTHU) +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 // log2(GLWAVES) +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], 
s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] +// A uint8 to fp16 mcc +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_mov_b32 s[sgprLDSMask], 0x4000 + + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil 
+v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / 
s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM 
+v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = workgroup id saved before the 1D->3D remap +v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; this is the total thread count, change it to match the number of waves +v_mul_lo_u32 v6, 2, v5 // v6 = v5 * 2 (512 slots per workgroup in total) +v_add_u32 v7, v6, v[vgprSerial] // v7 = thread id plus per-workgroup base = global serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = serial * 0x40 (byte offset of this thread's slot) +v_mov_b32 v2, 0 // v2 = 0, addend for the carry-only add below +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high dword of the debug buffer address +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // low dword = debug base low dword plus v8, carry out in vcc +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // high dword = v3 plus the carry
+.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store: thread serial +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc (low dword only, no carry) +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store: remapped WorkGroup0 +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store: remapped WorkGroup1 +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + +//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 // +s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice +v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1] + + + + + + + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 
s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? +s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // +s_cmp_eq_u32 s[sgprTemp1], 0 // +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] //depthU*PEB + +/******************************************/ +/* Generate Global B parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp0], 5, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 31, v[vgprSerial] +v_lshrrev_b32 v[vgprTemp1], 3, v[vgprTemp1] +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp1], 7, v[vgprSerial] +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element + + + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + + + + +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd +s_cmp_gt_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprStructNum], 512 +s_cmov_b32 s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 
1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] + +v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB] +v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] + + + +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 // NOTE(review): LOADxWAVES_K_A evaluates to 64/8*4 = 32, whose log2 is 5 (the B side uses 5) - confirm 4 is intended +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +// LDS write address for A, per global-load wave +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] // bits 16 and up = (GlWaveID & 1) * 4 +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +// compute the per-lane LDS read addresses for A +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +// ADDR_WRAP vaddr: if vaddr >= v[vgprTemp1] (unsigned compare), subtract v[vgprTemp2] from vaddr; otherwise leave it unchanged. +// Inputs: v[vgprTemp1] = wrap limit, v[vgprTemp2] = amount to subtract. Clobbers s[sgprTemp0:sgprTemp1] and v[vgprTemp3]. +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 
s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] // v[vgprTemp3] = v[vgprTemp2] in lanes where the compare held, else 0 +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm +v_mov_b32 v[vgprTemp2], 0x200 // amount subtracted when an address wraps +v_mov_b32 v[vgprTemp1], 0x400 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // wrap limit = 0x400 plus the (WaveID>>2)*0x8000 base still held in s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] // wrap limit = 0x800 plus the same base +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load; NOTE(review): exact duplicate of the previous .set + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*32 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +// LDS write address for B, per global-load wave +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] // bits 16 and up = GlWaveID & 3 +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] +v_lshlrev_b32 v[sgprTemp2], 3, v[vgprTemp2] // NOTE(review): VGPR indexed with the SGPR symbol sgprTemp2 (so this writes v82); the add below reads the same register, but a vgprTemp* symbol was probably intended - confirm +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] +s_mov_b32 s[sgprTemp1], 0x210 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[sgprTemp2] +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], 
v[vgprLocalReadAddrB], s[sgprTemp1] +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+1], 0x100, v[vgprLocalReadAddrB] +v_add_u32 v[vgprLocalReadAddrB+2], 0x120, v[vgprLocalReadAddrB] + +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 9, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 512 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v4, 512, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 512 +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], s[sgprTemp0] + + + + +/******************************************/ +/* Keep Sgpr Values for use later ... 
*/ +/******************************************/ + +//store sgprs to keep value +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2 + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 
s[sgprWaveID], 8 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp1], 8 // tlu=0, scaled tile-offset by stride +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req +// Issues four buffer_load_dwordx2 with the lds modifier (A, A1, B, B1); before each load m0 is set to the matching LocalWriteAddr SGPR plus the offset argument. +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds // B uses a two-VGPR address (idxen and offen) + +s_add_u32 m0, s[sgprLocalWriteAddrB1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetB1:vgprGlobalReadOffsetB1+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +.endm +// GLOBAL_LOAD_Scale_Zero: load one byte through the sgprZero descriptor into vgprValuZeros and one dword through the sgprScale descriptor into vgprValuScales. +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm +// GLOBAL_INC_Scale_Zero: advance the Scale base by GlobalReadIncsA>>3 and the Zero base by GlobalReadIncsA>>5, and reduce word 2 of each descriptor by the same step, saturating at 0. Scratch: s[sgprTemp0..sgprTemp2]. +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 // Scale step = A increment / 8 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] // candidate new limit = limit - step +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // limit < step ? +s_cmov_b32 s[sgprScale+2], 0 // then clamp the limit to 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // limit >= step ? (false after clamping unless step is 0) +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // then limit = limit - step + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 // Zero step = A increment / 32 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] // candidate new limit = limit - step +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // limit < step ? +s_cmov_b32 s[sgprZero+2], 0 // then clamp the limit to 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // limit >= step ? (false after clamping unless step is 0)
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] // then limit = limit - step + +.endm +// UnPackB32ToTwoF32: treat the dword in vgprScale as two 16-bit halves and convert each from f16 to f32 (low half to vgprOut+0, high half to vgprOut+1). +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1] +.endm +// UnPackB8To2F32: bits 0..3 of vgprZero go to vgprOut+0 and bits 4..7 to vgprOut+1, each converted to f32 with v_cvt_f32_ubyte0. +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm + +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req +// For each of the four dwords at vgprIn: take bits 0..3 and the value shifted right by 4, convert both to f32, apply value*scale plus zero (v_pk_fma_f32), convert to f16 and pack into four dwords at vgprPack. Scratch: vgprValuA_X0_H0+0..7. The shifted value is not masked, so this presumably relies on each input dword holding a single byte - confirm. +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], 
v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] // bits 0..3 of inputs 0 and 1 +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] // bits 0..3 of inputs 2 and 3 +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] // shifted (upper) values of inputs 0 and 1 +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] // shifted (upper) values of inputs 2 and 3 +.endm + +/******************************************/ +/* Define Global Load address Increase... */ +/******************************************/ + +.macro GLOBAL_INC +// Advance the A descriptor base by sgprGlobalReadIncsA, shrink the 64-bit A shadow limit by the same amount and refresh sgprSrdA+2 from it; advance the B descriptor base by sgprGlobalReadIncsB (the B limit update is commented out). Scratch: s[sgprTemp0:sgprTemp1]. +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] +//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] +//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + + +.endm + +/******************************************/ +/* Define LDS Load... */ +/******************************************/ + +.macro LDS_LOADAB off:req +// Reads four single bytes of A through vgprLocalReadAddrA+0..3 into vgprValuA_X0_I0+0..3 and two 64-bit B values through vgprLocalReadAddrB and vgprLocalReadAddrB+1 into vgprValuB_X0_I0+0..3, all at the given LDS offset. +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off + +ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0 +ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB+1] offset:\off+0 +.endm + +.macro LDS_LOADAB1 off:req +// Counterpart of LDS_LOADAB for the X1 registers: A bytes come from vgprLocalReadAddrA+4..7; the first B read reuses vgprLocalReadAddrB at offset plus 32 and the second uses vgprLocalReadAddrB+2. +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off + +ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32 +ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB+2] offset:\off+0 +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 + + +s_min_u32 s[sgprLoopCntCommon], 4, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 8 +s_cbranch_scc1 SkipGL + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], 
v[vgprGlobalReadOffsetB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB1+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + +.endif + + + + + +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + + + + +s_cmp_lt_i32 s[sgprTemp3], 2 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(8) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 
s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_waitcnt vmcnt(4) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... */ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x4 +v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] 
+v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... */ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 + +GLOBAL_INC_Scale_Zero +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt 
lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +WaveID_gecase: + + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW4_7: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLoopCntCommon] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump 
store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 
TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(6) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW4_7 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/******************************************/ +/* Tail Loop Process ... 
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1] + +s_mul_i32 s[sgprTemp2], 8192, s[sgprWaveID] +v_add_u32 v[vgprLocalWriteC], s[sgprTemp2], v[vgprLocalWriteC] +s_barrier + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3 +v_add_u32 v[vgprLocalReadC], 0x8000, v[vgprLocalWriteC] +Skip_Wave0_3: + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7 +s_mov_b32 s[sgprTemp2], 0x8000 +v_sub_u32 v[vgprLocalReadC], v[vgprLocalWriteC], s[sgprTemp2] +Skip_Wave4_7: + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3_W + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:1792 + +s_waitcnt lgkmcnt(0) +s_barrier +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+0] offset:768 +ds_read_b32 
v[vgprTmpValC+10], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+0] offset:1792 + +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+8] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+12] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+11] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+15] +Skip_Wave0_3_W: + + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7_W + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +s_waitcnt lgkmcnt(0) +s_barrier + + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+0] offset:1792 +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+8] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+12] +v_add_f32 v[vgprTmpValC+1], 
v[vgprTmpValC+1], v[vgprTmpValC+9] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+13] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+10] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+14] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+11] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+15] + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprLocalWriteC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + +Skip_Wave4_7_W: + + + + + +/******************************************/ +/* Global Write Process ... 
*/ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 
v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], 
s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M 
offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +/* +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 
v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // 
store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 
+v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 +*/ +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s new file mode 100644 index 0000000000000000000000000000000000000000..785d078d4cfaaca712b007d9d83147d2511d166a --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16.s @@ -0,0 +1,2055 @@ + 
+/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 36864 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 
+ .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 
+ .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 36864 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetB, 214 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set 
sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // first SGPR of the debug buffer address (loaded as a 2-dword pair) + + +// user-defined SGPR indices; several reuse an index that is already assigned above +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 30 // same index as sgprAddressC +.set sgprGlobalReadIncsA, 62 +.set sgprGlobalReadIncsB, 63 + +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +//.set sgprWaveID, 74 +.set sgprWaveID, 64 // redefined to 88 a few lines below +.set sgprLDSMask, 75 // redefined to 89 a few lines below +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 84 // same index as sgprTensor2dSizeA +.set sgprWaveID, 88 +.set sgprLDSMask, 89 + +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set sgprStrideStruct, 96 // same index as sgprTemp4 +.set sgprStructBit, 97 // same index as sgprTemp5 +.set sgprStructNum, 98 // same index as sgprTemp6 + +.set sgprLocalWriteAddrScale, 74 +.set sgprLocalWriteAddrZero, 75 +.set sgprZeroAddress, 68 +.set sgprScaleAddress, 70 +.set sgprZero, 64 +.set sgprScale, 20 // same index as sgprSrdC + + + +.macro MMAC_32x64_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: 
vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // + + +s_setprio 0 +.endm + +.macro MMAC_32x64_1 +v_mmac_f32_16x16x16_f16 
v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // +s_setprio 1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // +v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] 
v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // +s_setprio 0 +.endm + +.set MT0, 32 +.set MT1, 64 + +.set LDS_B_OFFSET, 2048 +.set LDS_BLK_OFFSET, 6144 +.set LDS_BLK_OFFSET_64Kmasked, 6144 +.set LOG2BPE, 1 +.set BPE, 2 +.set DEPTHU, 32 +.set LOG2DEPTHU, 5 +.set PFTLOOPS, 6 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 32 +.set NperWAVE, 64 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args: every offset below is relative to kernarg byte 16 +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // fills s24..s39, i.e. sgprSizesFree (24) through sgprStridesC+1 (39) +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // fills sgprStridesA (40..41) and sgprStridesB (42..43) +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // fills sgprAlpha (44) and sgprBeta (45) +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 // NOTE(review): this is kernarg byte 112, which .amdgpu_metadata names dstD -- confirm +s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68 // NOTE(review): this is kernarg byte 120, which .amdgpu_metadata names Synchronizer -- confirm +s_waitcnt lgkmcnt(0) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 +//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0] +s_waitcnt lgkmcnt(0) // second wait; the load just above it is commented out +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] // keep the WorkGroup0 value from before the remap below +// A uint8 to fp16 mcc: the two shifts below halve SizesFree0 and StridesA +s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1 +s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1 + +v_mov_b32 v[vgprSerial], v0 // save v0 as the thread serial +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 // WaveID = (serial of the first active lane) >> 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 // GlWaveID = WaveID & (GLWAVES-1) +s_mov_b32 s[sgprLDSMask], 0x9000 + + +v_mov_b32 v8, MT0 // v8 = MT0 (v8 is a VGPR) +v_mov_b32 v7, 
s[sgprSizesFree+0] // v7 = SizesFree0 +v_cvt_f32_u32 v6, v8 // v6 = float(v8) +v_rcp_iflag_f32 v6, v6 // v6 = 1.0 / v6 +v_cvt_f32_u32 v9, v7 // v9 = float(v7) +v_mul_f32 v6, v6, v9 // v6 = v7 / v8 in float +v_cvt_u32_f32 v6, v6 // v6 = quotient converted to u32 +v_mul_u32_u24 v9, v6, v8 // v9 = quotient * v8 +v_sub_u32 v9, v7, v9 // v9 = v7 - quotient * v8 (remainder) +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil: v6 += 1 where the remainder is non-zero +v_mov_b32 v8, MT1 // v8 = MT1 (v8 is a VGPR) +v_mov_b32 v7, s[sgprSizesFree+1] // v7 = SizesFree1 +v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // NumWorkGroups0 = ceil(SizesFree0 / MT0) from the block above +v_cvt_f32_u32 v6, v8 // v6 = float(v8) +v_rcp_iflag_f32 v6, v6 // v6 = 1.0 / v6 +v_cvt_f32_u32 v9, v7 // v9 = float(v7) +v_mul_f32 v6, v6, v9 // v6 = v7 / v8 in float +v_cvt_u32_f32 v6, v6 // v6 = quotient converted to u32 +v_mul_u32_u24 v9, v6, v8 // v9 = quotient * v8 +v_sub_u32 v9, v7, v9 // v9 = v7 - quotient * v8 (remainder) +v_cmp_ne_u32 vcc, v9, 0 // vcc = (remainder != 0) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil: v6 += 1 where the remainder is non-zero +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // NumWorkGroups1 = ceil(SizesFree1 / MT1) + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // s79 = low 14 bits of the GSU argument +s_mul_i32 s78, s78, s79 // s78 = NumWorkGroups0 * NumWorkGroups1 * s79 (the divisor) +v_cvt_f32_u32 v6, s78 // v6 = float(divisor) +v_rcp_iflag_f32 v6, v6 // v6 = 1.0 / divisor +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // v7 = float(WorkGroup0) +v_mul_f32 v6, v6, v7 // v6 = WorkGroup0 / divisor in float +v_cvt_u32_f32 v6, v6 // v6 = quotient converted to u32 +v_mul_u32_u24 v7, v6, s78 // v7 = quotient * divisor +v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = WorkGroup0 - quotient * divisor +v_cmpx_eq_u32 exec, v7, s78 // exec = lanes whose remainder equals the divisor +v_add_u32 v6, 1, v6 // those lanes bump the quotient by one +s_mov_b64 exec, -1 // re-enable all lanes +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 
s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 
+s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? +s_cselect_b32 s78, s79, s[sgprWGM] // s78 = s79 when blockId >= numFullBlocks, otherwise WGM +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder (overwritten by the s_mul_i32 on the next line) +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + + +.set debug_buffer, 0 +.if debug_buffer // compute each workgroup's address offset into the debug buffer +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = WorkGroup0 as saved before the remap +v_lshlrev_b32 v5, 
0x8, v4 // v5 = v4 * 256; 256 is the total thread count here -- adjust it to match the number of waves +v_mul_lo_u32 v6, 2, v5 // v6 = v5 * 2 +v_add_u32 v7, v6, v[vgprSerial] // v7 = v6 + thread serial +v_mul_lo_u32 v8, 0x40, v7 // v8 = v7 * 0x40 +v_mov_b32 v2, 0 // v2 = 0, added to the high dword together with the carry +v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high dword of the debug buffer address +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // low dword: AddressDbg + v8, carry out in vcc +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // high dword: v3 + v2 + carry +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +/******************************************/ +/* Generate Global A parameters ... 
*/ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] +v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] +v_mul_lo_u32 v0, 8, v0 +s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE +v_mul_lo_u32 v1, v1, s[sgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetA], v0, v1 +s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID] +v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] + +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3] +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ? 
+s_cbranch_scc1 label_SkipMmac + + +s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // Temp1 = SizesFree0 & 1 +s_cmp_eq_u32 s[sgprTemp1], 0 // skip the limit extension when SizesFree0 is even +s_cbranch_scc1 label_skiPadA +s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad +s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +label_skiPadA: + +s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] // GlobalReadIncsA = DEPTHU * BPE * StridesA + +/******************************************/ +/* Generate Global B parameters ... */ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + +v_and_b32 v[vgprTemp1], 255, v[vgprSerial] // Temp1 = serial & 255 +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1] // Temp1 >>= 6 +v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] // Temp1 = ((serial & 255) >> 6) * 4 +v_and_b32 v[vgprTemp0], 63, v[vgprSerial] // Temp0 = serial & 63 +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] // Temp0 >>= 4 +v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] // Temp0 = ((serial & 63) >> 4) * 16 +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] // Temp0 += Temp1 +v_and_b32 v[vgprTemp1], 15, v[vgprSerial] // Temp1 = serial & 15 +v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1] // Temp1 = (serial & 15) >> 2 +v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] // Temp0 += Temp1 +v_and_b32 v[vgprTemp1], 3, v[vgprSerial] // Temp1 = serial & 3 +v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] // Temp1 = (serial & 3) * 8 +s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd // sgprTemp1 = StridesB >> 13 +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // sgprTemp0 = WorkGroup1 * MT1 +s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0] // sgprTemp0 = SizesFree1 - WorkGroup1 * MT1 +s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // sgprTemp0 -= 1 +v_mov_b32 v[vgprTemp3], s[sgprTemp0] // Temp3 = SizesFree1 - WorkGroup1 * MT1 - 1 +v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] // clamp Temp0 to at most Temp3 (signed min) +v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // GlobalReadOffsetB+0 = Temp0 << (StridesB >> 13) +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element +s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd // StrideStruct = StridesB >> 13 +s_cmp_gt_u32 s[sgprSizesSum+0], 512 // SizesSum0 > 512 ? +s_cmov_b32 s[sgprStructNum], 512 // if so, StructNum = 512 +s_cmov_b32 
s[sgprStructBit], 26 + +s_cmp_gt_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprStructNum], 1024 +s_cmov_b32 s[sgprStructBit], 27 + +s_cmp_gt_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprStructNum], 2048 +s_cmov_b32 s[sgprStructBit], 28 + +s_cmp_gt_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprStructNum], 4096 +s_cmov_b32 s[sgprStructBit], 29 + +s_mov_b32 s[sgprTemp7], 0 +s_cmp_eq_u32 s[sgprSizesSum+0], 4096 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 2048 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 1024 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprSizesSum+0], 512 +s_cmov_b32 s[sgprTemp7], 1 +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip_K_Sub2048 +s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] +Skip_K_Sub2048: +Skip_K4096: + +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cbranch_scc1 Skip2_K4096 +s_cmp_le_u32 s[sgprSizesSum+0], 512 +s_cbranch_scc1 Skip2_K_Sub2048 + +v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0] +v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0] +v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2] +v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1] +v_mov_b32 v[vgprGlobalReadOffsetB+3], v[vgprTemp1] +Skip2_K_Sub2048: +Skip2_K4096: +s_cmp_eq_u32 s[sgprTemp7], 1 +s_cmov_b32 s[sgprStrideStruct], 0 + +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] + +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0 +// Set limit to use bytes fp16 = 0 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ? +s_cbranch_scc1 label_SkipMmac + +s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // +s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // +s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // +s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // +s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // + +s_cmp_eq_u32 s[sgprTemp7], 1 // +s_cbranch_scc1 Skip1_K4096 // +s_cmp_le_u32 s[sgprSizesSum+0], 512 // +s_cbranch_scc1 Skip1_K_Sub2048 // +s_mov_b32 s[sgprStructNum], 1 // +s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // +s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // +s_mov_b32 s[sgprTemp1], s[sgprStructNum] // +Skip1_K_Sub2048: // +Skip1_K4096: + +s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer, + +//Struct index Limit +s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] +s_cmp_eq_u32 s[sgprTemp0], 0 +s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] +s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1 +s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] +s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 +//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge +s_mov_b32 s[sgprSrdB+2], s[sgprTemp0] + +s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE + 
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds: A write base = GlWaveID * WAVE_LDS_OFFSET_A, with ((GlWaveID & 1) * 4) shifted left by 16 OR'ed in +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] // keep a copy of the initial write address + +//get lds read addrA: t = Serial & 63; addr = (t lshr 4) * 0x40 + (t & 15) + WaveID * LDS_SUB_M_OFFSET +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +/* +v_and_b32 v3, 1, v1 +v_mul_u32_u24 v3, 0x40, v3 +v_mul_u32_u24 v2, WAVE_LDS_OFFSET_A+0, v1 +v_add_u32 v2, v3, v2 +*/ +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] +// read addresses +1..+7 are fixed offsets from read address +0 +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//ADDR_WRAP vaddr: if vaddr is at or above v[vgprTemp1] (max clip), subtract v[vgprTemp2] (reducer) from it +// fixed inputs: v[vgprTemp1], v[vgprTemp2]; clobbers v[vgprTemp3] and s[sgprTemp0:sgprTemp1] +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm 
+v_mov_b32 v[vgprTemp2], 0x200 // ADDR_WRAP reducer +v_mov_b32 v[vgprTemp1], 0x400 // ADDR_WRAP clip limit for read address +5 +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 // ADDR_WRAP clip limit for read address +7 +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*64 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds: B write base = GlWaveID * WAVE_LDS_OFFSET_B, with (GlWaveID & 3) shifted left by 16 OR'ed in, plus LDS_B_OFFSET +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] // keep a copy of the initial write address +// LDS read address B: t = Serial & 63; addr = (t & 3) * 64 + (t lshr 4) * 8 + ((t & 15) lshr 2) * 0x410 + LDS_B_OFFSET +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 3, v[vgprTemp2] // shift in place; the destination was v[sgprTemp2], an SGPR index used as a VGPR number, which clobbered an unrelated VGPR +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] +s_mov_b32 s[sgprTemp1], 0x410 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[vgprTemp2] // add the (t lshr 4) * 8 term computed above +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] // keep a copy of the initial read address + +v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB] +v_and_b32 v0, 63, v[vgprSerial] // v0 = Serial & 63 +v_and_b32 v1, 15, v0 // v1 = v0 & 15 +v_lshrrev_b32 v2, 2, v1 // v2 = v1 lshr 2 +v_lshlrev_b32 v3, 10, v2 // v3 = v2 * 1024 +v_add_u32 v3, LDS_B_OFFSET, v3 // v3 += LDS_B_OFFSET +v_add_u32 v4, 1024, v3 // v4 = v3 + 1024, the upper bound used by the wrap check that follows 
+v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 1024 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] // +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 1024 +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + +/******************************************/ +/* Keep Sgpr Values for use later ... */ +/******************************************/ + +//store sgprs to keep value +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2 + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], 
s[sgprTemp0], v[vgprGlobalReadOffsetZero] +// 64-bit tile offset for the Scale buffer: WorkGroup0 * MT0 * 8 in s[sgprTemp0:sgprTemp1] +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride; high half must come from the low word (sgprTemp0), not from sgprTemp1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride +// Scale limit = (ShadowLimitA lshr 4) minus the low word of the tile offset +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 // Zero limit starts from ShadowLimitA lshr 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 // Zero tile offset = Scale tile offset / 4 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ +// GLOBAL_LOADAB offset: issue one A and one B buffer load flagged lds, writing m0 = local write address + offset before each +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +.endm +// GLOBAL_LOAD_Scale_Zero: load one zero-point byte into vgprValuZeros+0 and one scale dword into vgprValuScales+0 +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm +// GLOBAL_INC_Scale_Zero: advance the Scale SRD base by GlobalReadIncsA/8 and the Zero SRD base by GlobalReadIncsA/32, shrinking each limit (clamped at 0); clobbers sgprTemp0..sgprTemp2 +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // is the limit smaller than the increment? +s_cmov_b32 s[sgprScale+2], 0 // yes: clamp the limit to 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // does the limit still cover the increment? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] // yes: limit -= increment + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // is the limit smaller than the increment? +s_cmov_b32 s[sgprZero+2], 0 // yes: clamp the limit to 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // does the limit still cover the increment? 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2] // yes: limit -= increment + +.endm +// UnPackB32ToTwoF32: split v[vgprScale] into its low and high 16-bit halves and convert each from f16 to f32 (vgprOut+0 = low half, vgprOut+1 = high half) +.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req +v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] +v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0] +v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] +v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1] +.endm +// UnPackB8To2F32: split v[vgprZero] into two 4-bit values and convert each to f32 (vgprOut+0 = bits 3:0, vgprOut+1 = bits 7:4) +.macro UnPackB8To2F32 vgprZero:req vgprOut:req +v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero] +v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf +v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf +v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0] +v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1] +.endm +// I4ToFp16: unpack two 4-bit values from each of v[vgprIn+0..3] into vgprValuA_X0_H0+0..7, convert to f32 and apply scale/zero with v_pk_fma_f32; assumes each input register holds a single byte - TODO confirm against the LDS loads +.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1] +v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2] +v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3] +v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf + +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], 
v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] +v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1] + +v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0] +v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1] +v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2] +v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3] +v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4] +v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5] +v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6] +v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7] + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2] +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7] +.endm + +.macro I4ToFp16_old vgprIn:req vgprZero:req vgprScale:req vgprPack:req + +v_lshrrev_b32 v[vgprValuA_X0_H0+7], 28, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+6], 24, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+5], 20, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+4], 16, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+3], 12, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+2], 8, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn] +v_lshrrev_b32 v[vgprValuA_X0_H0+0], 0, v[\vgprIn] +v_and_b32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0], 0xf +v_and_b32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1], 0xf +v_and_b32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2], 0xf +v_and_b32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3], 0xf +v_and_b32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4], 0xf +v_and_b32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5], 0xf +v_and_b32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6], 0xf +v_and_b32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7], 0xf + +I32ToF16 vgprValuA_X0_H0+0 \vgprZero+0 \vgprScale+0 
+I32ToF16 vgprValuA_X0_H0+1 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+2 \vgprZero+2 \vgprScale+2 +I32ToF16 vgprValuA_X0_H0+3 \vgprZero+3 \vgprScale+3 +I32ToF16 vgprValuA_X0_H0+4 \vgprZero+0 \vgprScale+0 +I32ToF16 vgprValuA_X0_H0+5 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+6 \vgprZero+2 \vgprScale+2 +I32ToF16 vgprValuA_X0_H0+7 \vgprZero+3 \vgprScale+3 + +v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+4] +v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+5] +v_pack_b32_f16 v[\vgprPack+4], v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+6] +v_pack_b32_f16 v[\vgprPack+6], v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+7] + +v_lshrrev_b32 v[vgprValuA_X0_H0+15], 28, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+14], 24, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+13], 20, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+12], 16, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+11], 12, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+10], 8, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+9], 4, v[\vgprIn+1] +v_lshrrev_b32 v[vgprValuA_X0_H0+8], 0, v[\vgprIn+1] + +v_and_b32 v[vgprValuA_X0_H0+8], v[vgprValuA_X0_H0+8], 0xf +v_and_b32 v[vgprValuA_X0_H0+9], v[vgprValuA_X0_H0+9], 0xf +v_and_b32 v[vgprValuA_X0_H0+10], v[vgprValuA_X0_H0+10], 0xf +v_and_b32 v[vgprValuA_X0_H0+11], v[vgprValuA_X0_H0+11], 0xf +v_and_b32 v[vgprValuA_X0_H0+12], v[vgprValuA_X0_H0+12], 0xf +v_and_b32 v[vgprValuA_X0_H0+13], v[vgprValuA_X0_H0+13], 0xf +v_and_b32 v[vgprValuA_X0_H0+14], v[vgprValuA_X0_H0+14], 0xf +v_and_b32 v[vgprValuA_X0_H0+15], v[vgprValuA_X0_H0+15], 0xf + +I32ToF16 vgprValuA_X0_H0+8 \vgprZero+0 \vgprScale+0 +I32ToF16 vgprValuA_X0_H0+9 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+10 \vgprZero+2 \vgprScale+2 +I32ToF16 vgprValuA_X0_H0+11 \vgprZero+3 \vgprScale+3 +I32ToF16 vgprValuA_X0_H0+12 \vgprZero+0 \vgprScale+0 +I32ToF16 vgprValuA_X0_H0+13 \vgprZero+1 \vgprScale+1 +I32ToF16 vgprValuA_X0_H0+14 \vgprZero+2 \vgprScale+2 +I32ToF16 
vgprValuA_X0_H0+15 \vgprZero+3 \vgprScale+3 + +v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+8], v[vgprValuA_X0_H0+12] +v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+9], v[vgprValuA_X0_H0+13] +v_pack_b32_f16 v[\vgprPack+5], v[vgprValuA_X0_H0+10], v[vgprValuA_X0_H0+14] +v_pack_b32_f16 v[\vgprPack+7], v[vgprValuA_X0_H0+11], v[vgprValuA_X0_H0+15] +.endm + + + + +/******************************************/ +/* Define Global Load adress Increase... */ +/******************************************/ + +.macro GLOBAL_INC + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 + +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] +//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] +//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] +//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 + + +.endm + +/******************************************/ +/* Define LDS Load... 
*/ +/******************************************/ + +.macro LDS_LOADAB off:req + +ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off +ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off + +ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0 +ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256 +ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512 +ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0 +.endm + +.macro LDS_LOADAB1 off:req + +ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off +ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off + +ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32 +ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288 +ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544 +ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0 +.endm + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ + +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5 +s_min_u32 s[sgprLoopCntCommon], 6, s[sgprLoopCounterL] +s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 +s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc +s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail + +s_cmp_lt_i32 s[sgprWaveID], 4 +s_cbranch_scc1 SkipGL +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + +s_cmp_lt_i32 s[sgprTemp3], 4 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(8) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_cmp_eq_i32 s[sgprTemp3], 2 +s_cbranch_scc1 Last2 +s_cmp_eq_i32 s[sgprTemp3], 3 +s_cbranch_scc1 Last3 +s_waitcnt vmcnt(6) +s_barrier +Last3: +s_waitcnt vmcnt(4) +s_barrier +Last2: +s_waitcnt vmcnt(2) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... 
*/ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_mul_i32 s[sgprTemp0], s[sgprWaveID], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... 
*/ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 +v_mov_b32 v[vgprValuC+16], 0x0 +v_mov_b32 v[vgprValuC+17], 0x0 +v_mov_b32 v[vgprValuC+18], 0x0 +v_mov_b32 v[vgprValuC+19], 0x0 +v_mov_b32 v[vgprValuC+20], 0x0 +v_mov_b32 v[vgprValuC+21], 0x0 +v_mov_b32 v[vgprValuC+22], 0x0 +v_mov_b32 v[vgprValuC+23], 0x0 +v_mov_b32 v[vgprValuC+24], 0x0 +v_mov_b32 v[vgprValuC+25], 0x0 +v_mov_b32 v[vgprValuC+26], 0x0 +v_mov_b32 v[vgprValuC+27], 0x0 +v_mov_b32 v[vgprValuC+28], 0x0 +v_mov_b32 v[vgprValuC+29], 0x0 +v_mov_b32 v[vgprValuC+30], 0x0 +v_mov_b32 v[vgprValuC+31], 0x0 + +GLOBAL_INC_Scale_Zero +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... 
*/ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuZeros+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetZero+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... */ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(8) + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X0_I0+2] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadAddrA+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // 
debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + +.endif + + + + +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 + + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuZerosI32+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuScalesF32+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X2_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuB_X1_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuA_X3_I0+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword 
v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + + +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(8) +I4ToFp16 
vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*4 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*5 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +s_sub_u32 
s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP +s_branch WaveID_EndSwitch +WaveID_gecase: + +WaveID_EndSwitch: + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_mov_b32 s[sgprTemp1], s[sgprWaveID] +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 
v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], 
s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge 
+v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 
v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 
s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] 
//v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short 
v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], 
v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 
s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 
v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s new file mode 100644 index 0000000000000000000000000000000000000000..4d4dc68d22e50388dcb52e7f0240731ff85853f8 --- /dev/null +++ b/asm_kernels/Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT4_2_w4a16_splitK.s @@ -0,0 +1,2301 @@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.globl Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 +.p2align 8 +.type Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 256 // vgprs + .amdhsa_next_free_sgpr 100 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =256 */ +/* Num AccVGPR=0 */ +/* Num SGPR =100 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* 
ThreadTile= 4 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3 + .symbol: 'Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + .offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + 
.value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 768 + .private_segment_fixed_size: 0 + .sgpr_count: 100 + .sgpr_spill_count: 0 + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... 
+.end_amdgpu_metadata +Cijk_Ailk_Bljk_HHS_BH_UserArgs_MT64x64x32_SN_K1_PGR6_SB1_TT2_2_WG16_16_3: +label_ASM_Start: /// Main body of the asm kernel + +.set vgprValuC, 0 +/* +.set vgprValuA_X0_I0, 16 +.set vgprValuA_X1_I0, 20 +.set vgprValuB_X0_I0, 24 +.set vgprValuB_X1_I0, 28 +*/ +.set vgprValuB_X0_I0, 32 +.set vgprValuB_X1_I0, 40 + +.set vgprValuA_X0_I0, 236 +.set vgprValuA_X1_I0, 240 + +.set vgprValuA_X0_H0, 64 +.set vgprValuA_X1_H0, 96 +.set vgprValuA_X2_I0, 128 +.set vgprValuA_X3_I0, 144 +.set vgprValuZeros, 160 +.set vgprValuZerosI32, 164 +.set vgprValuScales, 176 +.set vgprValuScalesF32, 180 +.set vgprGlobalReadOffsetScale, 196 +.set vgprGlobalReadOffsetZero, 198 + +//user define +.set vgprGLA, 230 +.set vgprGLB, 188 +.set vgprLocalWriteA, 196 +.set vgprLocalWriteB, 198 + +.set vgprLocalWriteC, 196 +.set vgprLocalReadC, 230 +.set vgprTmpValC, 0 + +.set vgprAddressDbg, 200 //debugbuffer +.set vgprDebugTmp, 202 //debugbuffer +.set vgprSerial, 203 +.set vgprTemp0, 204 +.set vgprTemp1, 205 +.set vgprTemp2, 206 +.set vgprTemp3, 207 +.set vgprGlobalWriteOffsetD,208 +.set vgprGlobalReadOffsetA, 210 +.set vgprGlobalReadOffsetA1, 212 +.set vgprGlobalReadOffsetB, 214 +.set vgprGlobalReadOffsetB1, 218 +.set vgprLocalReadAddrA, 220 +.set vgprLocalReadAddrB, 228 +.set vgprLocalReadAddrB_ori, 228 +.set vgprKeepSgprValue, 255 + + + +.set BufferLimit, 0 //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 3 +.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 + +.set sgprSizesFree, 24 
+.set sgprSizesSum, 27 // s[24:39] are filled by one s_load_dwordx16: SizesFree(24..26), SizesSum(27), AddressD/C/A/B, StridesD/C
+.set sgprAddressD, 28
+.set sgprAddressC, 30 // same index as sgprShadowLimitB
+.set sgprAddressA, 32
+.set sgprAddressB, 34
+.set sgprStridesD, 36
+.set sgprStridesC, 38
+.set sgprStridesA, 40 // s[40:43] (StridesA, StridesB) are filled by the s_load_dwordx4 at offset 0x40
+.set sgprStridesB, 42
+.set sgprAlpha, 44 // s[44:45] (Alpha, Beta) are filled by the s_load_dwordx2 at offset 0x50
+.set sgprBeta, 45
+.set sgprWGMBuffer, 46 // in this part of the file only referenced by a commented-out s_load
+.set sgprAddressDbg, 48 // debug buffer address, s[48:49], loaded from offset 0x58
+
+
+//user define
+.set sgprWorkGroup0Ori, 50 // copy of the incoming s[sgprWorkGroup0], taken before the workgroup remap
+.set sgprLoopCntCommon, 51
+.set sgprSrdA, 52 // 4-dword buffer resource descriptor for A, s[52:55]
+.set sgprSrdB, 56 // 4-dword buffer resource descriptor for B, s[56:59]
+.set sgprShadowLimitA, 60 // 64-bit value, s[60:61]
+.set sgprShadowLimitB, 30 // 64-bit value, s[30:31]; reuses the sgprAddressC registers
+.set sgprGlobalReadIncsA, 62
+.set sgprGlobalReadIncsB, 63
+
+.set sgprLocalWriteAddrA, 72
+.set sgprLocalWriteAddrB, 73
+.set sgprLDSMask, 75 // NOTE(review): assigned again (89) further down, and 75 is also sgprLocalWriteAddrZero
+//.set sgprLoopforPfIter, 76
+//.set sgprLDSWriteIter, 78
+
+.set sgprLocalWriteAddrA1, 76 // s76..s79 are also used as unnamed scratch (s76/s77/s78/s79) by the workgroup-remap code
+.set sgprLocalWriteAddrB1, 77
+.set sgprLocalWriteAddrA1ori, 78
+.set sgprLocalWriteAddrB1ori, 79
+
+
+
+.set sgprTemp0, 80
+.set sgprTemp1, 81
+.set sgprTemp2, 82
+.set sgprTemp3, 83
+.set sgprTensor2dSizeA, 84 // 64-bit value, s[84:85]
+.set sgprTensor2dSizeB, 84 // same registers as sgprTensor2dSizeA
+.set sgprWaveID, 88
+.set sgprLDSMask, 89 // second assignment of sgprLDSMask; presumably the value seen by the code below - confirm
+
+.set sgprLocalWriteAddrAori, 90
+.set sgprLocalWriteAddrBori, 91
+.set sgprGlWaveID, 92
+.set sgprD_MEdge, 93
+.set sgprTemp4, 96
+.set sgprTemp5, 97
+.set sgprTemp6, 98
+.set sgprTemp7, 99 // later used as the "SizesSum[0] is exactly 512/1024/2048/4096" flag
+.set sgprStrideStruct, 96 // same index as sgprTemp4
+.set sgprStructBit, 97 // same index as sgprTemp5
+.set sgprStructNum, 98 // same index as sgprTemp6
+
+.set sgprLocalWriteAddrScale, 74
+.set sgprLocalWriteAddrZero, 75
+.set sgprZeroAddress, 68 // 64-bit, loaded from offset 0x60
+.set sgprScaleAddress, 70 // 64-bit, loaded from offset 0x68
+.set sgprZero, 64 // 4-dword buffer resource descriptor for the zero points, s[64:67]
+.set sgprScale, 20 // 4-dword buffer resource descriptor for the scales, s[20:23]; reuses the sgprSrdC registers
+
+// MMAC_32x64_0: eight v_mmac_f32_16x16x16_f16 ops. Accumulator tile i (4 VGPRs at vgprValuC+4*i, used as first and last operand) takes A block i%2 of vgprValuA_X2_I0 and B block i/2 of vgprValuB_X0_I0.
+// Wave priority is raised with s_setprio 1 after the first op and reset to 0 at the end. Presumably each op computes C += A*B - confirm against the ISA manual.
+.macro MMAC_32x64_0
+v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // acc 0: A blk 0, B blk 0
+s_setprio 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+0*2+0:vgprValuB_X0_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // acc 1: A blk 1, B blk 0
+v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // acc 2: A blk 0, B blk 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+1*2+0:vgprValuB_X0_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // acc 3: A blk 1, B blk 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // acc 4: A blk 0, B blk 2
+v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+2*2+0:vgprValuB_X0_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // acc 5: A blk 1, B blk 2
+v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X2_I0+0*2+0:vgprValuA_X2_I0+0*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // acc 6: A blk 0, B blk 3
+v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X2_I0+1*2+0:vgprValuA_X2_I0+1*2+1] v[vgprValuB_X0_I0+3*2+0:vgprValuB_X0_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // acc 7: A blk 1, B blk 3
+
+
+s_setprio 0
+.endm
+// MMAC_32x64_1: same eight-op pattern as MMAC_32x64_0 on the same accumulators, with A taken from vgprValuA_X3_I0 and B from vgprValuB_X1_I0.
+.macro MMAC_32x64_1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+0*4+0:vgprValuC+0*4+1:vgprValuC+0*4+2:vgprValuC+0*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+0*4+0: vgprValuC+0*4+1: vgprValuC+0*4+2: vgprValuC+0*4+3] // acc 0: A blk 0, B blk 0
+s_setprio 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+1*4+0:vgprValuC+1*4+1:vgprValuC+1*4+2:vgprValuC+1*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+0*2+0:vgprValuB_X1_I0+0*2+1] v[vgprValuC+1*4+0: vgprValuC+1*4+1: vgprValuC+1*4+2: vgprValuC+1*4+3] // acc 1: A blk 1, B blk 0
+v_mmac_f32_16x16x16_f16 v[vgprValuC+2*4+0:vgprValuC+2*4+1:vgprValuC+2*4+2:vgprValuC+2*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+2*4+0: vgprValuC+2*4+1: vgprValuC+2*4+2: vgprValuC+2*4+3] // acc 2: A blk 0, B blk 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+3*4+0:vgprValuC+3*4+1:vgprValuC+3*4+2:vgprValuC+3*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+1*2+0:vgprValuB_X1_I0+1*2+1] v[vgprValuC+3*4+0: vgprValuC+3*4+1: vgprValuC+3*4+2: vgprValuC+3*4+3] // acc 3: A blk 1, B blk 1
+v_mmac_f32_16x16x16_f16 v[vgprValuC+4*4+0:vgprValuC+4*4+1:vgprValuC+4*4+2:vgprValuC+4*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+4*4+0: vgprValuC+4*4+1: vgprValuC+4*4+2: vgprValuC+4*4+3] // acc 4: A blk 0, B blk 2
+v_mmac_f32_16x16x16_f16 v[vgprValuC+5*4+0:vgprValuC+5*4+1:vgprValuC+5*4+2:vgprValuC+5*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+2*2+0:vgprValuB_X1_I0+2*2+1] v[vgprValuC+5*4+0: vgprValuC+5*4+1: vgprValuC+5*4+2: vgprValuC+5*4+3] // acc 5: A blk 1, B blk 2
+v_mmac_f32_16x16x16_f16 v[vgprValuC+6*4+0:vgprValuC+6*4+1:vgprValuC+6*4+2:vgprValuC+6*4+3] v[vgprValuA_X3_I0+0*2+0:vgprValuA_X3_I0+0*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+6*4+0: vgprValuC+6*4+1: vgprValuC+6*4+2: vgprValuC+6*4+3] // acc 6: A blk 0, B blk 3
+v_mmac_f32_16x16x16_f16 v[vgprValuC+7*4+0:vgprValuC+7*4+1:vgprValuC+7*4+2:vgprValuC+7*4+3] v[vgprValuA_X3_I0+1*2+0:vgprValuA_X3_I0+1*2+1] v[vgprValuB_X1_I0+3*2+0:vgprValuB_X1_I0+3*2+1] v[vgprValuC+7*4+0: vgprValuC+7*4+1: vgprValuC+7*4+2: vgprValuC+7*4+3] // acc 7: A blk 1, B blk 3
+s_setprio 0
+.endm
+
+.set MT0, 32 // tile size along free index 0 (divisor for NumWorkGroups0). NOTE(review): the kernel label says MT64x64x32 - confirm the units
+.set MT1, 64 // tile size along free index 1 (divisor for NumWorkGroups1)
+
+.set LDS_B_OFFSET, 2048 // added to the B LDS write and read addresses
+.set LDS_BLK_OFFSET, 6144
+.set LDS_BLK_OFFSET_64Kmasked, 6144
+.set LOG2BPE, 1
+.set BPE, 2 // bytes per element
+.set DEPTHU, 32
+.set LOG2DEPTHU, 5
+.set PFTLOOPS, 6
+.set GLWAVES, 4 // sgprGlWaveID = sgprWaveID & (GLWAVES-1)
+.set LOG2GLWAVES, 2
+.set MperWAVE, 32
+.set NperWAVE, 64
+
+/* Load num of Gemms */
+s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0
+
+/* Load GSU data */
+s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4
+
+/* Load WGM data */
+s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8
+
+s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args: the offsets below are relative to base+16
+s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0
+s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // SizesFree .. StridesC
+s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // StridesA, StridesB
+s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // Alpha, Beta
+s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58
+s_load_dwordx2 s[sgprZeroAddress:sgprZeroAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60
+s_load_dwordx2 s[sgprScaleAddress:sgprScaleAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x68
+s_waitcnt lgkmcnt(0) // all scalar loads above have landed
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 8 // offset for the disabled WGMBuffer load on the next line
+//s_load_dwordx2 s[sgprTemp2:sgprTemp3], s[sgprWGMBuffer:sgprWGMBuffer+1], s[sgprTemp0]
+s_waitcnt lgkmcnt(0)
+s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] // keep the original 1D workgroup index
+// Halve SizesFree[0] and StridesA (original note: "A uint8 to fp16 mcc"); presumably because A holds two 4-bit values per byte - confirm
+s_lshr_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 1
+s_lshr_b32 s[sgprStridesA], s[sgprStridesA], 1
+
+v_mov_b32 v[vgprSerial], v0 // v0 = thread index within the workgroup
+v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial]
+s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 // wave index = first lane's thread index / 64
+s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1
+s_mov_b32 s[sgprLDSMask], 0x6000
+
+// NumWorkGroups0 = ceil(SizesFree[0] / MT0): float-reciprocal quotient, then +1 when the remainder is non-zero (finishes with v_cmp_ne/v_addc right after these lines)
+v_mov_b32 v8, MT0 // v8 = MT0 (a VGPR, despite the original "into sgpr" note)
+v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size
+v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8)
+v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8)
+v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8)
+v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8)
+v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8)
+v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8)
+v_sub_u32 v9, v7, v9 // v9 = remainder = v7 - quotient*v8
+v_cmp_ne_u32 vcc, v9, 0 // remainder != 0 ?
+v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil: add the carry-in (1 when the remainder is non-zero)
+v_mov_b32 v8, MT1 // v8 = MT1 (a VGPR, despite the original "into sgpr" note)
+v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size
+v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // NumWorkGroups0 = ceil(SizesFree[0] / MT0)
+v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8)
+v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8)
+v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8)
+v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8)
+v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8)
+v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8)
+v_sub_u32 v9, v7, v9 // v9 = remainder
+v_cmp_ne_u32 vcc, v9, 0 // remainder != 0 ?
+v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil
+v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // NumWorkGroups1 = ceil(SizesFree[1] / MT1)
+// Each division below: q = u32(f32(n) * rcp(f32(d))), r = n - q*d, and lanes where r == d get q+1 (v_cmpx narrows exec, then exec is reset to all ones).
+/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */
+/* wg2 = idxWG012 / (numWG0*numWG1*GSU) - the divisor built below includes the GSU factor */
+s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1]
+s_and_b32 s79, s[sgprGSU], 0x3fff // s79 = low 14 bits of GSU
+s_mul_i32 s78, s78, s79
+v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78
+v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78
+v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78
+v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78
+v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78
+v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78
+v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = remainder
+v_cmpx_eq_u32 exec, v7, s78 // keep only lanes whose remainder equals the divisor
+v_add_u32 v6, 1, v6 // ...and bump their quotient
+s_mov_b64 exec, -1 // re-enable all lanes
+v_readfirstlane_b32 s78, v6 // quotient
+s_mov_b32 s[sgprWorkGroup2], s78
+/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 * GSU */
+s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0]
+s_mul_i32 s78, s78, s[sgprWorkGroup2]
+s_mul_i32 s78, s78, s79
+s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78
+/* wg1 = idxWG01 / numWG0 */
+v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0]
+v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0]
+v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0]
+v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0]
+v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0]
+v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0]
+v_sub_u32 v7, s[sgprWorkGroup0], v7 // v7 = remainder
+v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // remainder == divisor ?
+v_add_u32 v6, 1, v6 // quotient correction
+s_mov_b64 exec, -1 // re-enable all lanes
+v_readfirstlane_b32 s78, v6 // quotient
+s_mov_b32 s[sgprWorkGroup1], s78
+/* wg0 = idxWG01 - wg1 * numWG0 */
+s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0]
+s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78
+
+s_cmp_le_i32 s[sgprWGM], 1 // WGM <= 1: keep (wg0, wg1) as computed above
+s_cbranch_scc1 label_WGM
+v_cvt_f32_u32 v6, s[sgprWGM] // blockId = wg1 / WGM
+v_rcp_iflag_f32 v6, v6 // WGM
+v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM
+v_mul_f32 v6, v6, v7 // WGM
+v_cvt_u32_f32 v6, v6 // WGM
+v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM
+v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM
+v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM
+v_add_u32 v6, 1, v6 // WGM
+s_mov_b64 exec, -1 // WGM
+v_readfirstlane_b32 s76, v6 // s76 = blockId (quotient)
+s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor
+s_sub_u32 s77, s[sgprWorkGroup1], s77 // s77 = wg1 % WGM
+s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0
+s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0
+v_cvt_f32_u32 v6, s[sgprWGM] // numFullBlocks = NumWorkGroups1 / WGM
+v_rcp_iflag_f32 v6, v6 // WGM
+v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM
+v_mul_f32 v6, v6, v7 // WGM
+v_cvt_u32_f32 v6, v6 // WGM
+v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM
+v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM
+v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM: remainder == divisor ?
+v_add_u32 v6, 1, v6 // WGM: quotient correction
+s_mov_b64 exec, -1 // WGM: re-enable all lanes
+v_readfirstlane_b32 s78, v6 // s78 = numFullBlocks (quotient)
+s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor
+s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // s79 = NumWorkGroups1 % WGM
+s_cmp_eq_u32 s79, 0 // remainder == 0 ?
+s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0
+s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ?
+s_cselect_b32 s78, s79, s[sgprWGM] // s78 = rows in this block: the remainder for the last block, else WGM
+v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78
+v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78
+v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78
+v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78
+v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78
+v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78
+v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78
+v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78
+v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78
+v_mov_b32 v7, 0 // placeholder; the real remainder is recomputed with scalar ops below
+s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78
+v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient
+v_readfirstlane_b32 s[sgprWorkGroup1], v7 // overwritten by the next two instructions
+s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor
+s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder
+s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM
+s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM
+label_WGM:
+
+
+.set debug_buffer, 0 // 0 disables every ".if debug_buffer" block below at assembly time
+.if debug_buffer // compute each workgroup's address offset into the debug buffer
+v_mov_b32 v4, s[sgprWorkGroup0Ori] // v4 = original 1D workgroup index
+v_lshlrev_b32 v5, 0x8, v4 // v5 = v4 * 256; 256 is the total thread count here - adjust it to match the number of waves
+v_mul_lo_u32 v6, 2, v5 // v6 = 2 * v5
+v_add_u32 v7, v6, v[vgprSerial] // v7 = v6 + thread index = serial
+v_mul_lo_u32 v8, 0x40, v7 // v8 = serial * 0x40 (64 bytes per thread)
+v_mov_b32 v2, 0 // zero for the carry add
+v_mov_b32 v3, s[sgprAddressDbg+1] // v3 = high dword of the debug buffer address
+v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg] = AddressDbg + v8 (low dword)
+v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // high dword plus carry
+.endif
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], v[vgprSerial] // dump the thread index
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] // dump the remapped wg0
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] // dump the remapped wg1
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+.endif
+
+/******************************************/
+/* Generate Global A parameters ... */
+/******************************************/
+.set COALESCE_THREAD_A, 8 //x4 load
+.set LOG2_COALESCE_THREAD_A, 3 //x4 load
+
+v_and_b32 v[vgprTemp0], v[vgprSerial], 63 // lane = thread index & 63
+v_lshrrev_b32 v1, LOG2_COALESCE_THREAD_A, v[vgprTemp0] // v1 = lane / 8
+v_and_b32 v0, COALESCE_THREAD_A-1, v[vgprTemp0] // v0 = lane % 8
+v_mul_lo_u32 v0, 8, v0 // v0 = (lane % 8) * 8
+s_mul_i32 s[sgprTemp0], s[sgprStridesA], GLWAVES*BPE
+v_mul_lo_u32 v1, v1, s[sgprTemp0] // v1 = (lane / 8) * StridesA * GLWAVES * BPE
+v_add_u32 v[vgprGlobalReadOffsetA], v0, v1
+s_mul_i32 s[sgprTemp1], s[sgprStridesA], BPE
+s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprGlWaveID]
+v_add_u32 v[vgprGlobalReadOffsetA], v[vgprGlobalReadOffsetA], s[sgprTemp1] // += GlWaveID * StridesA * BPE
+
+
+//s_lshr_b32 s[sgprTemp1], s[sgprSizesSum+0], 1 //
+s_mul_i32 s[sgprTemp1], s[sgprSizesSum+0], s[sgprStridesA] // notice: second A offset = first + SizesSum * StridesA
+v_add_u32 v[vgprGlobalReadOffsetA1], v[vgprGlobalReadOffsetA], s[sgprTemp1]
+
+
+
+
+
+
+
+s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesSum] // Tensor2dSizeA = StridesA * SizesSum (low dword)
+s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesSum] // (high dword)
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // tile start along free index 0
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] // ShadowLimitA = Tensor2dSizeA - tile start (low dword; the borrow is consumed by the next instruction)
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], 0 // high dword of ShadowLimitA, consuming the borrow from the preceding s_sub_u32
+// Convert the limit from elements to bytes (shift by LOG2BPE)
+s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG
+s_mul_i32 s[sgprTemp2], s[sgprStridesA+1], s[sgprWorkGroup2] // Stride*WG
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // add the tile start (WorkGroup0 * MT0)
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // elements -> bytes
+s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp2] // SRD base = AddressA + byte offset
+s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp3]
+s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD
+
+s_cmp_eq_u64 s[sgprAddressA:sgprAddressA+1], 0 // s[sgprAddressA] == 0 ?
+s_cbranch_scc1 label_SkipMmac // null A pointer: branch to label_SkipMmac (defined outside this excerpt)
+
+
+s_and_b32 s[sgprTemp1], s[sgprSizesFree+0], 1 // is the (already halved) SizesFree[0] odd?
+s_cmp_eq_u32 s[sgprTemp1], 0 //
+s_cbranch_scc1 label_skiPadA // even: no extension needed
+s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 2 // extend limit for pre-pad
+s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+label_skiPadA:
+
+s_mul_i32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s[sgprStridesA] // per-iteration A increment = DEPTHU * BPE * StridesA
+
+/******************************************/
+/* Generate Global B parameters ... */
+/******************************************/
+.set COALESCE_THREAD_B, 8 //x4 load
+.set LOG2_COALESCE_THREAD_B, 3 //x4 load
+
+v_and_b32 v[vgprTemp1], 255, v[vgprSerial]
+v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp1]
+v_lshlrev_b32 v[vgprTemp1], 2, v[vgprTemp1] // Temp1 = ((tid & 255) / 64) * 4
+v_and_b32 v[vgprTemp0], 63, v[vgprSerial]
+v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0]
+v_lshlrev_b32 v[vgprTemp0], 4, v[vgprTemp0] // Temp0 = ((tid & 63) / 16) * 16
+v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1]
+v_and_b32 v[vgprTemp1], 15, v[vgprSerial]
+v_lshrrev_b32 v[vgprTemp1], 2, v[vgprTemp1]
+v_add_u32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp1] // Temp0 += (tid & 15) / 4 -> index in 0..63 within the MT1 tile
+v_and_b32 v[vgprTemp1], 3, v[vgprSerial]
+v_lshlrev_b32 v[vgprTemp1], 3, v[vgprTemp1] // Temp1 = (tid & 3) * 8
+s_lshr_b32 s[sgprTemp1], s[sgprStridesB], 0xd // shift amount = StridesB >> 13
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // L624
+s_sub_u32 s[sgprTemp0], s[sgprSizesFree+1], s[sgprTemp0]
+s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // last valid index = SizesFree[1] - WorkGroup1*MT1 - 1
+v_mov_b32 v[vgprTemp3], s[sgprTemp0] //
+v_min_i32 v[vgprTemp0], v[vgprTemp0], v[vgprTemp3] // clamp the index to the last valid one
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+0], s[sgprTemp1], v[vgprTemp0] // first dword of the pair = clamped index << (StridesB >> 13)
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprTemp1] // offset *= bytes/element
+
+
+
+v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB]
+v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1] // second B offset = first + SizesSum
+
+
+
+
+s_lshr_b32 s[sgprStrideStruct], s[sgprStridesB], 0xd
+s_cmp_gt_u32 s[sgprSizesSum+0], 512 // the four cmp/cmov groups below leave the largest threshold that SizesSum[0] exceeds
+s_cmov_b32 s[sgprStructNum], 512
+s_cmov_b32 s[sgprStructBit], 26
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 1024
+s_cmov_b32 s[sgprStructNum], 1024
+s_cmov_b32 s[sgprStructBit], 27
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 2048
+s_cmov_b32 s[sgprStructNum], 2048
+s_cmov_b32 s[sgprStructBit], 28
+
+s_cmp_gt_u32 s[sgprSizesSum+0], 4096
+s_cmov_b32 s[sgprStructNum], 4096
+s_cmov_b32 s[sgprStructBit], 29
+
+s_mov_b32 s[sgprTemp7], 0 // Temp7 = 1 when SizesSum[0] is exactly 512, 1024, 2048 or 4096
+s_cmp_eq_u32 s[sgprSizesSum+0], 4096
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 2048
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 1024
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprSizesSum+0], 512
+s_cmov_b32 s[sgprTemp7], 1
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cbranch_scc1 Skip_K4096 // exact size: keep StrideStruct as is
+s_cmp_le_u32 s[sgprSizesSum+0], 512
+s_cbranch_scc1 Skip_K_Sub2048 // SizesSum[0] <= 512: keep StrideStruct as is
+s_sub_u32 s[sgprStrideStruct], s[sgprStridesB], s[sgprStructNum] // otherwise StrideStruct = StridesB - StructNum
+Skip_K_Sub2048:
+Skip_K4096:
+
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cbranch_scc1 Skip2_K4096
+s_cmp_le_u32 s[sgprSizesSum+0], 512
+s_cbranch_scc1 Skip2_K_Sub2048
+// Non-exact size above 512: rebuild the B offsets with the index in +0 and (Temp1 + StrideStruct*index)*2 in +1
+v_mul_lo_u32 v[vgprTemp2], s[sgprStrideStruct], v[vgprTemp0]
+v_mov_b32 v[vgprGlobalReadOffsetB+0], v[vgprTemp0]
+v_add_u32 v[vgprGlobalReadOffsetB+1], v[vgprTemp1], v[vgprTemp2]
+v_lshlrev_b32 v[vgprGlobalReadOffsetB+1], 0x1, v[vgprGlobalReadOffsetB+1]
+
+v_mov_b32 v[vgprGlobalReadOffsetB1], v[vgprGlobalReadOffsetB]
+v_add_u32 v[vgprGlobalReadOffsetB1+1], s[sgprSizesSum], v[vgprGlobalReadOffsetB+1]
+
+
+
+Skip2_K_Sub2048:
+Skip2_K4096:
+s_cmp_eq_u32 s[sgprTemp7], 1
+s_cmov_b32 s[sgprStrideStruct], 0 // exact size: StrideStruct = 0
+
+s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprSizesFree+1], s[sgprStridesB] // Tensor2dSizeB = SizesFree[1] * StridesB (low dword)
+s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprSizesFree+1], s[sgprStridesB] // (high dword)
+s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1
+s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB] // tile start = WorkGroup1 * MT1 * StridesB
+
+s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] // note: ShadowLimitB lives in s[30:31], which previously held AddressC
+s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], 0
+// Convert the limit from elements to bytes (shift by LOG2BPE)
+s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE
+s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 (overwritten by the struct index limit at the end of this section)
+s_mul_hi_u32 s[sgprTemp3], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG
+s_mul_i32 s[sgprTemp2], s[sgprStridesB+1], s[sgprWorkGroup2] // Stride*WG
+s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] // accum wg term to tilestart (Temp0 = WorkGroup1 * MT1 * StridesB)
+s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 // accum wg term to tilestart
+s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 // elements -> bytes
+s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp2] // SRD base = Address+ tileStart0
+s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp3] // high dword of the base
+s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD
+
+s_cmp_eq_u64 s[sgprAddressB:sgprAddressB+1], 0 // s[sgprAddressB] == 0 ?
+s_cbranch_scc1 label_SkipMmac // null B pointer: branch to label_SkipMmac (defined outside this excerpt)
+
+s_lshr_b32 s[sgprTemp0], s[sgprStridesB], 13 // Temp0 = StridesB >> 13
+s_mul_i32 s[sgprTemp1], s[sgprStridesB], 2 // Temp1 = StridesB * 2
+s_lshr_b32 s[sgprTemp2], s[sgprTemp1], s[sgprTemp0] // Temp2 = (StridesB * 2) >> (StridesB >> 13)
+s_lshl_b32 s[sgprTemp0], s[sgprTemp2], 16 // move it to bit 16 and up
+s_or_b32 s[sgprTemp1], s[sgprTemp0], 0x40000000 // and set bit 30; presumably the stride and swizzle fields of SRD word 1 - confirm against the ISA manual
+
+s_cmp_eq_u32 s[sgprTemp7], 1 // exact 512/1024/2048/4096 size: keep the value above
+s_cbranch_scc1 Skip1_K4096 //
+s_cmp_le_u32 s[sgprSizesSum+0], 512 // SizesSum[0] <= 512: keep the value above
+s_cbranch_scc1 Skip1_K_Sub2048 //
+s_mov_b32 s[sgprStructNum], 1 //
+s_lshl_b32 s[sgprStructNum], s[sgprStructNum], s[sgprStructBit] // 1 << StructBit (26..29)
+s_or_b32 s[sgprStructNum], s[sgprStructNum], 0x40000000 // plus bit 30
+s_mov_b32 s[sgprTemp1], s[sgprStructNum] // replaces the value computed above
+Skip1_K_Sub2048: //
+Skip1_K4096:
+
+s_or_b32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // struct buffer: OR the fields into SRD word 1
+
+//Struct index Limit: SRD word 2 = number of valid indices in this tile along free index 1
+s_and_b32 s[sgprTemp0], MT1-1, s[sgprSizesFree+1] // SizesFree[1] % MT1
+s_cmp_eq_u32 s[sgprTemp0], 0
+s_cselect_b32 s[sgprTemp0], MT1, s[sgprTemp0] // a zero remainder means a full tile
+s_sub_u32 s[sgprTemp1], s[sgprNumWorkGroups1], 1
+s_cmp_eq_u32 s[sgprWorkGroup1], s[sgprTemp1] // is this the last workgroup along index 1?
+s_cselect_b32 s[sgprTemp0], s[sgprTemp0], MT1 // last one: the remainder, otherwise MT1
+//s_sub_u32 s[sgprTemp0], s[sgprTemp0], 1 // mcc test edge
+s_mov_b32 s[sgprSrdB+2], s[sgprTemp0]
+
+s_mov_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE // per-iteration B increment in bytes
+
+/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*8 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +//.set LDS_SUB_M_OFFSET, BPE*32 +.set LDS_SUB_M_OFFSET, 16 +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 4 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 1 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 4 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] + +//get lds read addrA +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 15, v[vgprTemp0] +v_lshrrev_b32 v1, 4, v[vgprTemp0] +v_mul_u32_u24 v2, 0x40, v1 + +v_add_u32 v[vgprLocalReadAddrA], v2, v0 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], LDS_SUB_M_OFFSET +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp0] + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp2], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp2] + +v_add_u32 v[vgprLocalReadAddrA+1], 0x240, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+2], 0x400, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+3], 0x640, v[vgprLocalReadAddrA] + +v_add_u32 v[vgprLocalReadAddrA+4], 0x100, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+5], 0x340, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+6], 0x500, v[vgprLocalReadAddrA] +v_add_u32 v[vgprLocalReadAddrA+7], 0x740, v[vgprLocalReadAddrA] + + +//fixed input: v[vgprTemp0]->added address; +// v[vgprTemp1]->max clips; v[vgprTemp2]->reducer +.macro ADDR_WRAP vaddr:req +v_cmp_ge_u32 
s[sgprTemp0:sgprTemp1], \vaddr, v[vgprTemp1] +v_cndmask_b32 v[vgprTemp3], 0, v[vgprTemp2], s[sgprTemp0:sgprTemp1] +v_sub_u32 \vaddr, \vaddr, v[vgprTemp3] +.endm +v_mov_b32 v[vgprTemp2], 0x200 +v_mov_b32 v[vgprTemp1], 0x400 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+5] +v_mov_b32 v[vgprTemp1], 0x800 +v_add_u32 v[vgprTemp1], v[vgprTemp1], s[sgprTemp2] +ADDR_WRAP v[vgprLocalReadAddrA+7] + +.set WAVE_LDS_OFFSET, UNDEF //x4 load +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LDS_SUB_N_OFFSET, BPE*64 +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v[vgprTemp1], v[vgprTemp0], 3 +v_lshrrev_b32 v[vgprTemp2], 4, v[vgprTemp0] +v_mul_u32_u24 v[vgprTemp1], 64, v[vgprTemp1] +v_lshlrev_b32 v[sgprTemp2], 3, v[vgprTemp2] +v_and_b32 v[vgprTemp0], v[vgprTemp0], 15 +v_lshrrev_b32 v[vgprTemp0], 2, v[vgprTemp0] +s_mov_b32 s[sgprTemp1], 0x410 +v_mul_lo_u32 v[vgprTemp0], s[sgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprTemp1], v[vgprTemp1], v[vgprTemp0] +v_add_u32 v[vgprLocalReadAddrB], v[vgprTemp1], v[sgprTemp2] +s_mov_b32 s[sgprTemp1], LDS_B_OFFSET +v_add_u32 v[vgprLocalReadAddrB], 
v[vgprLocalReadAddrB], s[sgprTemp1] +v_mov_b32 v[vgprLocalReadAddrB_ori], v[vgprLocalReadAddrB] + +v_add_u32 v[vgprLocalReadAddrB+1], 768, v[vgprLocalReadAddrB] +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+1], v4 +v_mov_b32 v5, 1024 // +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], v5 // + +v_add_u32 v[vgprLocalReadAddrB+2], 800, v[vgprLocalReadAddrB] // +v_and_b32 v0, 63, v[vgprSerial] // +v_and_b32 v1, 15, v0 // +v_lshrrev_b32 v2, 2, v1 // +v_lshlrev_b32 v3, 10, v2 // +v_add_u32 v3, LDS_B_OFFSET, v3 // +v_add_u32 v4, 1024, v3 // +v_cmp_ge_u32 s[80:81], v[vgprLocalReadAddrB+2], v4 +v_mov_b32 v5, 1024 +v_cndmask_b32 v5, 0, v5, s[80:81] +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], v5 // + + +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0x8000 +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], s[sgprTemp0] +v_add_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], s[sgprTemp0] + + + + +/******************************************/ +/* Keep Sgpr Values for use later ... 
*/ +/******************************************/ + +//store sgprs to keep value +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2 + + +/******************************************/ +/* Generate Scale Zeros */ +/******************************************/ +v_and_b32 v[vgprGlobalReadOffsetScale], v[vgprSerial], 15 +v_lshlrev_b32 v[vgprGlobalReadOffsetScale], 0x2, v[vgprGlobalReadOffsetScale] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 64 +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprTemp1] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetScale], s[sgprTemp0], v[vgprGlobalReadOffsetScale] + + +v_and_b32 v[vgprGlobalReadOffsetZero], v[vgprSerial], 15 +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 16 +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_lshr_b32 s[sgprTemp2], s[sgprSizesSum+0], 6 // +//s_lshl_b32 s[sgprTemp1], s[sgprStridesA], 2 +s_mul_i32 s[sgprTemp1], s[sgprTemp2], s[sgprStridesA] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 2 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +v_add_u32 v[vgprGlobalReadOffsetZero], s[sgprTemp0], v[vgprGlobalReadOffsetZero] + + +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0*2 +s_add_u32 s[sgprTemp2], s[sgprShadowLimitA], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad +s_cmp_lt_i32 
s[sgprWaveID], 8 +s_cmov_b32 s[sgprShadowLimitA], s[sgprTemp2] +s_cmov_b32 s[sgprShadowLimitA+1], s[sgprTemp3] + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[01] * MT +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp1], 8 // tlu=0, scaled tile-offset by stride +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 8 // tlu=0, scaled tile-offset by stride + +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 4 + +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprScale+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 + +s_add_u32 s[sgprScale+0], s[sgprScaleAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprScale+1], s[sgprScaleAddress+1], s[sgprTemp1] // + +s_mov_b32 s[sgprScale+3], Srd127_96 // Set bits 127_96 in SRD +s_lshr_b64 s[sgprTemp2:sgprTemp3], s[sgprShadowLimitA:sgprShadowLimitA+1], 6 + +s_lshr_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], 2 +s_sub_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_subb_u32 s[sgprTemp3], s[sgprTemp3], 0 + +s_cmp_eq_u32 s[sgprTemp3], 0 // are we within 2^32? +s_cselect_b32 s[sgprZero+2], s[sgprTemp2], BufferLimit // Move shadow to real if we are within 2^32 +s_add_u32 s[sgprZero+0], s[sgprZeroAddress+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprZero+1], s[sgprZeroAddress+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprZero+3], Srd127_96 + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADAB offset:req + +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrA1], \offset +buffer_load_dwordx2 v[vgprGlobalReadOffsetA1+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB:vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +s_add_u32 m0, s[sgprLocalWriteAddrB1], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB1:vgprGlobalReadOffsetB1+1], s[sgprSrdB:sgprSrdB+3], 0, idxen offen offset:0, lds + +.endm + +.macro GLOBAL_LOAD_Scale_Zero +buffer_load_ubyte v[vgprValuZeros+0], v[vgprGlobalReadOffsetZero+0], s[sgprZero:sgprZero+3], 0 offen offset:0 +buffer_load_dword v[vgprValuScales+0], v[vgprGlobalReadOffsetScale+0], s[sgprScale:sgprScale+3], 0 offen offset:0 +.endm + +.macro GLOBAL_INC_Scale_Zero +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 3 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprScale+0], s[sgprScale+0], s[sgprTemp0] +s_addc_u32 s[sgprScale+1], s[sgprScale+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprScale+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprScale+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprScale+2], 0 +s_cmp_ge_u32 s[sgprScale+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprScale+2], s[sgprTemp2] + +s_lshr_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0], 5 +s_mov_b32 s[sgprTemp1], 0 +s_add_u32 s[sgprZero+0], s[sgprZero+0], s[sgprTemp0] +s_addc_u32 s[sgprZero+1], s[sgprZero+1], s[sgprTemp1] +s_sub_u32 s[sgprTemp2], s[sgprZero+2], s[sgprTemp0] +s_cmp_lt_u32 s[sgprZero+2], s[sgprTemp0] // are we within 2^32? +s_cmov_b32 s[sgprZero+2], 0 +s_cmp_ge_u32 s[sgprZero+2], s[sgprTemp0] // are we within 2^32? 
+s_cmov_b32 s[sgprZero+2], s[sgprTemp2]
+ .endm
+/*
+ * UnPackB32ToTwoF32 vgprScale, vgprOut
+ *   Splits the 32-bit register v[vgprScale] into its two 16-bit halves and
+ *   converts each half from f16 to f32:
+ *     v[vgprOut+0] <- f32(bits [15:0]  of v[vgprScale])
+ *     v[vgprOut+1] <- f32(bits [31:16] of v[vgprScale])
+ *   Only v[vgprOut+0] and v[vgprOut+1] are written.
+ */
+.macro UnPackB32ToTwoF32 vgprScale:req vgprOut:req
+v_lshlrev_b32 v[\vgprOut+0], 16, v[\vgprScale] // shift left then right by 16: keeps only the low half
+v_lshrrev_b32 v[\vgprOut+0], 16, v[\vgprOut+0]
+v_lshrrev_b32 v[\vgprOut+1], 16, v[\vgprScale+0] // high half moved down to bits [15:0]
+v_cvt_f32_f16 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_f16 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+/*
+ * UnPackB8To2F32 vgprZero, vgprOut
+ *   Splits the low byte of v[vgprZero] into two 4-bit fields and converts each
+ *   one (as an unsigned value) to f32:
+ *     v[vgprOut+0] <- f32(bits [3:0] of v[vgprZero])
+ *     v[vgprOut+1] <- f32(bits [7:4] of v[vgprZero])
+ *   Only v[vgprOut+0] and v[vgprOut+1] are written.
+ */
+.macro UnPackB8To2F32 vgprZero:req vgprOut:req
+v_lshrrev_b32 v[\vgprOut+1], 4, v[\vgprZero]
+v_and_b32 v[\vgprOut+0], v[\vgprZero], 0xf
+v_and_b32 v[\vgprOut+1], v[\vgprOut+1], 0xf
+v_cvt_f32_ubyte0 v[\vgprOut+0], v[\vgprOut+0]
+v_cvt_f32_ubyte0 v[\vgprOut+1], v[\vgprOut+1]
+.endm
+/*
+ * I4ToFp16 vgprIn, vgprZero, vgprScale, vgprPack
+ *   Expands four registers v[vgprIn+0 .. vgprIn+3], each carrying two 4-bit
+ *   values in its low byte, into eight f16 values packed into
+ *   v[vgprPack+0 .. vgprPack+3].
+ *
+ *   Steps (scratch registers are v[vgprValuA_X0_H0+0 .. +7], all clobbered):
+ *     1. H0+2k   <- v[vgprIn+k] & 0xf   (low nibble)
+ *        H0+2k+1 <- v[vgprIn+k] >> 4    (not masked; step 2 only reads byte 0)
+ *     2. every H0+i is converted from unsigned byte 0 to f32
+ *     3. packed FMA on register pairs:
+ *          H0+2k   <- H0+2k   * v[vgprScale+0] + v[vgprZero+0]
+ *          H0+2k+1 <- H0+2k+1 * v[vgprScale+1] + v[vgprZero+1]
+ *     4. every H0+i is converted f32 -> f16
+ *     5. v[vgprPack+0] <- pack(H0+0, H0+2)   low nibbles of In+0, In+1
+ *        v[vgprPack+1] <- pack(H0+4, H0+6)   low nibbles of In+2, In+3
+ *        v[vgprPack+2] <- pack(H0+1, H0+3)   upper bits  of In+0, In+1
+ *        v[vgprPack+3] <- pack(H0+5, H0+7)   upper bits  of In+2, In+3
+ *
+ *   vgprZero is ADDED by the FMA; the call sites in this file fill it with
+ *   -(zero * scale) via v_mul_f32 with a negated source before invoking the macro.
+ *   Assumes each v[vgprIn+k] has no bits set above bit 7 (the call sites load
+ *   them with ds_read_u8) -- TODO confirm for any other caller.
+ */
+.macro I4ToFp16 vgprIn:req vgprZero:req vgprScale:req vgprPack:req
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+1], 4, v[\vgprIn]
+v_and_b32 v[vgprValuA_X0_H0+0], v[\vgprIn], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+3], 4, v[\vgprIn+1]
+v_and_b32 v[vgprValuA_X0_H0+2], v[\vgprIn+1], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+5], 4, v[\vgprIn+2]
+v_and_b32 v[vgprValuA_X0_H0+4], v[\vgprIn+2], 0xf
+
+v_lshrrev_b32 v[vgprValuA_X0_H0+7], 4, v[\vgprIn+3]
+v_and_b32 v[vgprValuA_X0_H0+6], v[\vgprIn+3], 0xf
+
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f32_ubyte0 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pk_fma_f32 v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+0:vgprValuA_X0_H0+1], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+2:vgprValuA_X0_H0+3], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+4:vgprValuA_X0_H0+5], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+v_pk_fma_f32 v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+6:vgprValuA_X0_H0+7], v[\vgprScale:\vgprScale+1], v[\vgprZero:\vgprZero+1]
+
+v_cvt_f16_f32 v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+0]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+1]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+2], v[vgprValuA_X0_H0+2]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+3], v[vgprValuA_X0_H0+3]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+4]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+5]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+6], v[vgprValuA_X0_H0+6]
+v_cvt_f16_f32 v[vgprValuA_X0_H0+7], v[vgprValuA_X0_H0+7]
+
+v_pack_b32_f16 v[\vgprPack+0], v[vgprValuA_X0_H0+0], v[vgprValuA_X0_H0+2]
+v_pack_b32_f16 v[\vgprPack+1], v[vgprValuA_X0_H0+4], v[vgprValuA_X0_H0+6]
+v_pack_b32_f16 v[\vgprPack+2], v[vgprValuA_X0_H0+1], v[vgprValuA_X0_H0+3]
+v_pack_b32_f16 v[\vgprPack+3], v[vgprValuA_X0_H0+5], v[vgprValuA_X0_H0+7]
+.endm
+
+/******************************************/
+/* Define Global Load address Increase... */
+/******************************************/
+/*
+ * GLOBAL_INC (no arguments)
+ *   Advances the A and B buffer descriptors by one global-read increment.
+ *   A: adds s[sgprGlobalReadIncsA+0] (zero-extended to 64 bits) to the 64-bit
+ *      value in s[sgprSrdA+0:1], subtracts the same amount from the 64-bit
+ *      s[sgprShadowLimitA+0:1], then writes s[sgprSrdA+2]: ShadowLimitA+0 when
+ *      ShadowLimitA+1 == 0, otherwise the constant BufferLimit.
+ *   B: only s[sgprSrdB+0:1] is advanced; the ShadowLimitB / SrdB+2 update is
+ *      commented out in the macro body.
+ *   Clobbers s[sgprTemp0], s[sgprTemp1] and SCC.
+ */
+.macro GLOBAL_INC
+
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] // increment, low dword
+s_mov_b32 s[sgprTemp1], 0 // increment, high dword (always 0)
+s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] // propagate carry into the upper dword
+s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0]
+s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] // propagate borrow into the upper dword
+s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32?
+s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
+// B side: advance the 64-bit value in s[sgprSrdB+0:1] by s[sgprGlobalReadIncsB+0]; the limit update below is disabled.
+s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0]
+s_mov_b32 s[sgprTemp1], 0
+s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0]
+s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1]
+//s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0]
+//s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1]
+//s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32?
+//s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
+
+
+.endm
+
+/******************************************/
+/* Define LDS Load... */
+/******************************************/
+/*
+ * LDS_LOADAB off
+ *   Issues eight LDS reads for the "X0" register set, all using the
+ *   immediate offset `off` as base:
+ *     - four ds_read_u8 into v[vgprValuA_X0_I0+0..3], one per address register
+ *       v[vgprLocalReadAddrA+0..3], at offset off;
+ *     - three ds_read_b64 into v[vgprValuB_X0_I0+0..5] from v[vgprLocalReadAddrB]
+ *       at off+0, off+256 and off+512;
+ *     - one ds_read_b64 into v[vgprValuB_X0_I0+6..7] from v[vgprLocalReadAddrB+1]
+ *       at off+0.
+ *   The macro does not wait for the data; callers follow it with s_waitcnt lgkmcnt.
+ */
+.macro LDS_LOADAB off:req
+
+ds_read_u8 v[vgprValuA_X0_I0+ 0], v[vgprLocalReadAddrA+0] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 1], v[vgprLocalReadAddrA+1] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 2], v[vgprLocalReadAddrA+2] offset:\off
+ds_read_u8 v[vgprValuA_X0_I0+ 3], v[vgprLocalReadAddrA+3] offset:\off
+
+ds_read_b64 v[vgprValuB_X0_I0+ 0:vgprValuB_X0_I0+ 1], v[vgprLocalReadAddrB] offset:\off+0
+ds_read_b64 v[vgprValuB_X0_I0+ 2:vgprValuB_X0_I0+ 3], v[vgprLocalReadAddrB] offset:\off+256
+ds_read_b64 v[vgprValuB_X0_I0+ 4:vgprValuB_X0_I0+ 5], v[vgprLocalReadAddrB] offset:\off+512
+ds_read_b64 v[vgprValuB_X0_I0+ 6:vgprValuB_X0_I0+ 7], v[vgprLocalReadAddrB+1] offset:\off+0
+.endm
+/*
+ * LDS_LOADAB1 off
+ *   Same read pattern as LDS_LOADAB but for the "X1" register set:
+ *     - A: v[vgprValuA_X1_I0+0..3] from v[vgprLocalReadAddrA+4..7] at offset off;
+ *     - B: v[vgprValuB_X1_I0+0..5] from v[vgprLocalReadAddrB] at off+32, off+288
+ *       and off+544, and v[vgprValuB_X1_I0+6..7] from v[vgprLocalReadAddrB+2]
+ *       at off+0.
+ */
+.macro LDS_LOADAB1 off:req
+
+ds_read_u8 v[vgprValuA_X1_I0+ 0], v[vgprLocalReadAddrA+4] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 1], v[vgprLocalReadAddrA+5] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 2], v[vgprLocalReadAddrA+6] offset:\off
+ds_read_u8 v[vgprValuA_X1_I0+ 3], v[vgprLocalReadAddrA+7] offset:\off
+
+ds_read_b64 v[vgprValuB_X1_I0+ 0:vgprValuB_X1_I0+ 1], v[vgprLocalReadAddrB] offset:\off+32
+ds_read_b64 v[vgprValuB_X1_I0+ 2:vgprValuB_X1_I0+ 3], v[vgprLocalReadAddrB] offset:\off+288
+ds_read_b64 v[vgprValuB_X1_I0+ 4:vgprValuB_X1_I0+ 5], v[vgprLocalReadAddrB] offset:\off+544
+ds_read_b64 v[vgprValuB_X1_I0+ 6:vgprValuB_X1_I0+ 7], v[vgprLocalReadAddrB+2] offset:\off+0
+.endm
+
+/******************************************/
+/* Use Global Load Wave process ... */
+/******************************************/
+// Loop-count setup shared by all waves, followed by the wave split:
+// waves with WaveID < 8 branch to SkipGL, the remaining waves fall through.
+//s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 5
+s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 6 // LoopCounterL = SizesSum >> 6
+
+
+s_min_u32 s[sgprLoopCntCommon], 4, s[sgprLoopCounterL] // NOTE(review): LoopCntCommon and SCC are both overwritten below before being read -- looks dead, confirm
+s_and_b32 s[sgprTemp0], s[sgprSizesSum], 31 // SCC = 1 when any of the low 5 bits of SizesSum is set. NOTE(review): shift above is now 6 but the mask still covers 5 bits -- confirm 31 vs 63
+s_add_u32 s[sgprLoopCntCommon], s[sgprLoopCounterL], scc // LoopCntCommon = LoopCounterL + SCC (one extra iteration for a remainder)
+s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCntCommon], 1 // -1 for tail
+
+s_cmp_lt_i32 s[sgprWaveID], 8
+s_cbranch_scc1 SkipGL // WaveID < 8: skip the code that follows, up to the SkipGL label
+
+// Assembled only when debug_buffer is non-zero. Each group copies one value into
+// v[vgprDebugTmp], stores it at the address held in v[vgprAddressDbg:vgprAddressDbg+1]
+// and then adds 4 to v[vgprAddressDbg] (no carry into the upper dword).
+.if debug_buffer
+v_mov_b32 v[vgprDebugTmp], s[sgprLoopCounterL]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrA1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetA1]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 v[vgprDebugTmp], s[sgprLocalWriteAddrB]
+flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store
+v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc
+
+v_mov_b32 
v[vgprDebugTmp], s[sgprLocalWriteAddrB1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalReadOffsetB1+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprSrdA+1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + + + +.endif + + + + + +s_cmp_eq_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 SkipToLastLoad +s_mov_b32 s[sgprTemp3], 0 +PreFetchBegin: +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +GLOBAL_INC + +s_addk_i32 s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + +s_addk_i32 s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + + +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, 
s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + + + + +s_cmp_lt_i32 s[sgprTemp3], 2 +s_cbranch_scc1 SkipWait +s_waitcnt vmcnt(8) +s_barrier +SkipWait: +s_add_u32 s[sgprTemp3], s[sgprTemp3], 1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 1 +s_cbranch_scc1 PreFetchBegin +s_cmp_eq_i32 s[sgprTemp3], 1 +s_cbranch_scc1 Last1 +s_waitcnt vmcnt(4) +s_barrier +Last1: +s_waitcnt vmcnt(0) +s_barrier +s_barrier + +SkipToLastLoad: +s_mov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] +s_mov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] +s_add_u32 s[sgprLocalWriteAddrA1], 0x8000, s[sgprLocalWriteAddrA] +s_add_u32 s[sgprLocalWriteAddrB1], 0x8000, s[sgprLocalWriteAddrB] + +GLOBAL_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt vmcnt(0) +s_barrier +s_endpgm +SkipGL: + +/******************************************/ +/* Generate SrcD ... */ +/******************************************/ + +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], 128 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprStridesC+1], s[sgprWorkGroup2] +s_mul_i32 s[sgprTemp2], s[sgprStridesC+1], s[sgprWorkGroup2] +s_add_u32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp0] +s_addc_u32 s[sgprTemp3], s[sgprTemp3], 0 +s_lshl_b64 s[sgprTemp2:sgprTemp3], s[sgprTemp2:sgprTemp3], 0x1 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp2] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s[sgprTemp3] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], 1 +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD + +v_and_b32 v[vgprTemp0], 
v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +s_lshr_b32 s[sgprTemp0], s[sgprWaveID], 0x2 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x5 +v_add_u32 v[vgprTemp1], s[sgprTemp0], v[vgprTemp1] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], 0x2, v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], 0x1, v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_and_b32 s[sgprTemp0], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 32 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 0x1 +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... */ +/******************************************/ +GLOBAL_LOAD_Scale_Zero +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 +v_mov_b32 v[vgprValuC+4], 0x0 +v_mov_b32 v[vgprValuC+5], 0x0 +v_mov_b32 v[vgprValuC+6], 0x0 +v_mov_b32 v[vgprValuC+7], 0x0 +v_mov_b32 v[vgprValuC+8], 0x0 +v_mov_b32 v[vgprValuC+9], 0x0 +v_mov_b32 v[vgprValuC+10], 0x0 +v_mov_b32 v[vgprValuC+11], 0x0 +v_mov_b32 v[vgprValuC+12], 0x0 +v_mov_b32 v[vgprValuC+13], 0x0 +v_mov_b32 v[vgprValuC+14], 0x0 +v_mov_b32 v[vgprValuC+15], 0x0 +v_mov_b32 v[vgprValuC+16], 0x0 +v_mov_b32 v[vgprValuC+17], 0x0 +v_mov_b32 v[vgprValuC+18], 0x0 +v_mov_b32 v[vgprValuC+19], 0x0 +v_mov_b32 v[vgprValuC+20], 0x0 +v_mov_b32 v[vgprValuC+21], 0x0 +v_mov_b32 v[vgprValuC+22], 0x0 +v_mov_b32 v[vgprValuC+23], 0x0 +v_mov_b32 v[vgprValuC+24], 0x0 +v_mov_b32 v[vgprValuC+25], 0x0 +v_mov_b32 v[vgprValuC+26], 0x0 +v_mov_b32 v[vgprValuC+27], 0x0 +v_mov_b32 v[vgprValuC+28], 0x0 +v_mov_b32 v[vgprValuC+29], 0x0 +v_mov_b32 v[vgprValuC+30], 0x0 +v_mov_b32 v[vgprValuC+31], 0x0 + +GLOBAL_INC_Scale_Zero +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... 
*/ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_waitcnt vmcnt(0) +s_barrier + +LDS_LOADAB LDS_BLK_OFFSET*0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] + +/******************************************/ +/* Main Loop Process ... */ +/******************************************/ + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 WaveID_gecase + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW0_3: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], 
v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +s_barrier +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +WaveID_gecase: + + +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] +MainLoopBeginW4_7: + +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB 
LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], s[sgprLoopCntCommon] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +.endif + + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*1 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +LDS_LOADAB1 LDS_BLK_OFFSET*2 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 
+LDS_LOADAB LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +GLOBAL_LOAD_Scale_Zero +LDS_LOADAB1 LDS_BLK_OFFSET*3 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +s_barrier +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +s_waitcnt vmcnt(0) +GLOBAL_INC_Scale_Zero +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(8) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +UnPackB32ToTwoF32 vgprValuScales+0 vgprValuScalesF32+0 +UnPackB8To2F32 vgprValuZeros+0 vgprValuZerosI32+0 +v_mul_f32 v[vgprValuZerosI32+0], -v[vgprValuZerosI32+0], v[vgprValuScalesF32+0] +v_mul_f32 v[vgprValuZerosI32+1], -v[vgprValuZerosI32+1], v[vgprValuScalesF32+1] +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW4_7 + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/******************************************/ +/* Tail Loop Process ... 
*/ +/******************************************/ +TAIL_LOOP: + +s_waitcnt vmcnt(0) +s_barrier +LDS_LOADAB LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X0_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X2_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_0 +LDS_LOADAB1 LDS_BLK_OFFSET*0 +s_waitcnt lgkmcnt(0) +I4ToFp16 vgprValuA_X1_I0+0 vgprValuZerosI32+0 vgprValuScalesF32+0 vgprValuA_X3_I0 +.align32 8, 0xbf800001 +s_nop (1) +MMAC_32x64_1 + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[16] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + + +v_and_b32 v[vgprTemp1], 63, v[vgprSerial] +v_lshlrev_b32 v[vgprLocalWriteC], 2, v[vgprTemp1] + +s_mul_i32 s[sgprTemp2], 8192, s[sgprWaveID] +v_add_u32 v[vgprLocalWriteC], s[sgprTemp2], v[vgprLocalWriteC] + +s_barrier + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3 +v_add_u32 v[vgprLocalReadC], 0x8000, v[vgprLocalWriteC] +Skip_Wave0_3: + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7 +s_mov_b32 s[sgprTemp2], 0x8000 +v_sub_u32 v[vgprLocalReadC], v[vgprLocalWriteC], s[sgprTemp2] +Skip_Wave4_7: + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc1 Skip_Wave0_3_W +ds_write_b32 v[vgprLocalWriteC], v[16], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[20], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[17], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[21], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[18], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[22], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[19], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[23], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[24], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[28], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[25], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[29], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[26], offset:3072 +ds_write_b32 v[vgprLocalWriteC], 
v[30], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[27], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[31], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier +ds_read_b32 v[vgprTmpValC+16], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+20], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+17], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+21], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+18], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+22], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+19], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+23], v[vgprLocalReadC+0] offset:1792 +ds_read_b32 v[vgprTmpValC+24], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+28], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+25], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+29], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 v[vgprTmpValC+26], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+30], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+27], v[vgprLocalReadC+0] offset:3584 +ds_read_b32 v[vgprTmpValC+31], v[vgprLocalReadC+0] offset:3840 + +s_waitcnt lgkmcnt(0) +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+16] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+20] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+17] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+21] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+18] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+22] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+19] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+23] + +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+24] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+28] +v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+25] +v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+29] +v_add_f32 
v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+26] +v_add_f32 v[vgprTmpValC+14], v[vgprTmpValC+14], v[vgprTmpValC+30] +v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+27] +v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+31] +Skip_Wave0_3_W: + + + +s_cmp_ge_u32 s[sgprWaveID], 4 +s_cbranch_scc0 Skip_Wave4_7_W + +ds_write_b32 v[vgprLocalWriteC], v[0], offset:0 +ds_write_b32 v[vgprLocalWriteC], v[4], offset:256 +ds_write_b32 v[vgprLocalWriteC], v[1], offset:512 +ds_write_b32 v[vgprLocalWriteC], v[5], offset:768 +ds_write_b32 v[vgprLocalWriteC], v[2], offset:1024 +ds_write_b32 v[vgprLocalWriteC], v[6], offset:1280 +ds_write_b32 v[vgprLocalWriteC], v[3], offset:1536 +ds_write_b32 v[vgprLocalWriteC], v[7], offset:1792 + +ds_write_b32 v[vgprLocalWriteC], v[8], offset:2048 +ds_write_b32 v[vgprLocalWriteC], v[12], offset:2304 +ds_write_b32 v[vgprLocalWriteC], v[9], offset:2560 +ds_write_b32 v[vgprLocalWriteC], v[13], offset:2816 +ds_write_b32 v[vgprLocalWriteC], v[10], offset:3072 +ds_write_b32 v[vgprLocalWriteC], v[14], offset:3328 +ds_write_b32 v[vgprLocalWriteC], v[11], offset:3584 +ds_write_b32 v[vgprLocalWriteC], v[15], offset:3840 + +s_waitcnt lgkmcnt(0) +s_barrier + + +ds_read_b32 v[vgprTmpValC+0], v[vgprLocalReadC+0] offset:0 +ds_read_b32 v[vgprTmpValC+4], v[vgprLocalReadC+0] offset:256 +ds_read_b32 v[vgprTmpValC+1], v[vgprLocalReadC+0] offset:512 +ds_read_b32 v[vgprTmpValC+5], v[vgprLocalReadC+0] offset:768 +ds_read_b32 v[vgprTmpValC+2], v[vgprLocalReadC+0] offset:1024 +ds_read_b32 v[vgprTmpValC+6], v[vgprLocalReadC+0] offset:1280 +ds_read_b32 v[vgprTmpValC+3], v[vgprLocalReadC+0] offset:1536 +ds_read_b32 v[vgprTmpValC+7], v[vgprLocalReadC+0] offset:1792 +ds_read_b32 v[vgprTmpValC+8], v[vgprLocalReadC+0] offset:2048 +ds_read_b32 v[vgprTmpValC+12], v[vgprLocalReadC+0] offset:2304 +ds_read_b32 v[vgprTmpValC+9], v[vgprLocalReadC+0] offset:2560 +ds_read_b32 v[vgprTmpValC+13], v[vgprLocalReadC+0] offset:2816 +ds_read_b32 
v[vgprTmpValC+10], v[vgprLocalReadC+0] offset:3072 +ds_read_b32 v[vgprTmpValC+14], v[vgprLocalReadC+0] offset:3328 +ds_read_b32 v[vgprTmpValC+11], v[vgprLocalReadC+0] offset:3584 +ds_read_b32 v[vgprTmpValC+15], v[vgprLocalReadC+0] offset:3840 + +s_waitcnt lgkmcnt(0) + +v_add_f32 v[vgprTmpValC+0], v[vgprTmpValC+0], v[vgprTmpValC+16] +v_add_f32 v[vgprTmpValC+4], v[vgprTmpValC+4], v[vgprTmpValC+20] +v_add_f32 v[vgprTmpValC+1], v[vgprTmpValC+1], v[vgprTmpValC+17] +v_add_f32 v[vgprTmpValC+5], v[vgprTmpValC+5], v[vgprTmpValC+21] +v_add_f32 v[vgprTmpValC+2], v[vgprTmpValC+2], v[vgprTmpValC+18] +v_add_f32 v[vgprTmpValC+6], v[vgprTmpValC+6], v[vgprTmpValC+22] +v_add_f32 v[vgprTmpValC+3], v[vgprTmpValC+3], v[vgprTmpValC+19] +v_add_f32 v[vgprTmpValC+7], v[vgprTmpValC+7], v[vgprTmpValC+23] + +v_add_f32 v[vgprTmpValC+8], v[vgprTmpValC+8], v[vgprTmpValC+24] +v_add_f32 v[vgprTmpValC+12], v[vgprTmpValC+12], v[vgprTmpValC+28] +v_add_f32 v[vgprTmpValC+9], v[vgprTmpValC+9], v[vgprTmpValC+25] +v_add_f32 v[vgprTmpValC+13], v[vgprTmpValC+13], v[vgprTmpValC+29] +v_add_f32 v[vgprTmpValC+10], v[vgprTmpValC+10], v[vgprTmpValC+26] +v_add_f32 v[vgprTmpValC+14], v[vgprTmpValC+14], v[vgprTmpValC+30] +v_add_f32 v[vgprTmpValC+11], v[vgprTmpValC+11], v[vgprTmpValC+27] +v_add_f32 v[vgprTmpValC+15], v[vgprTmpValC+15], v[vgprTmpValC+31] + +/* +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprLocalWriteC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprLocalReadC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprTmpValC] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +*/ + 
+Skip_Wave4_7_W: + + + + + +/******************************************/ +/* Global Write Process ... */ +/******************************************/ + +label_SkipMmac: + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +Beta_eqcase: +s_lshl_b32 s[sgprSizesFree+0], s[sgprSizesFree+0], 2 +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], 128 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], 128*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_and_b32 s[sgprTemp1], s[sgprWaveID], 3 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], 32 +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprGlobalWriteOffsetD] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc + +v_mov_b32 v[vgprDebugTmp], v[vgprValuC+Nvoff+0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 
v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 
s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] 
//v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 8 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short 
v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 9 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] 
+buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 10 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], 
s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 11 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +/* +.set Nvoff, 16 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, 
s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 17 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 
v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 18 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 19 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 24 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] 
//v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 25 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store 
D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 26 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 
offen offset:0 // store D + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 2*4 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 + +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 27 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D + +v_add_u32 v[vgprTemp1], 2, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 2, v[vgprTemp2] +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+4], s[sgprAlpha], v[vgprValuC+Nvoff+4] +v_cvt_f16_f32 v[vgprValuC+Nvoff+4], v[vgprValuC+Nvoff+4] +buffer_store_short v[vgprValuC+Nvoff+4], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +*/ +Beta_EndSwitch: +s_endpgm diff --git a/asm_kernels/Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_awqGemm.s b/asm_kernels/Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_awqGemm.s new file mode 100644 index 0000000000000000000000000000000000000000..18744b5a03b87416c68710bf3c6d117cb9c2e7b3 --- /dev/null +++ b/asm_kernels/Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_awqGemm.s @@ -0,0 +1,1431 
@@ + +/******************************************/ +/* Begin Kernel */ +/******************************************/ +.amdgcn_target "amdgcn-amd-amdhsa--gfx936" +.text +.protected Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_WG16_16_2 +.globl Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_WG16_16_2 +.p2align 8 +.type Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_WG16_16_2,@function +.section .rodata,#alloc +.p2align 6 +.amdhsa_kernel Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_WG16_16_2 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_next_free_vgpr 94 // vgprs + .amdhsa_next_free_sgpr 101 // sgprs + .amdhsa_group_segment_fixed_size 65536 // lds bytes + .amdhsa_private_segment_fixed_size 0 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_float_denorm_mode_32 3 + .amdhsa_float_denorm_mode_16_64 3 +.end_amdhsa_kernel +.text +/* Num VGPR =36 */ +/* Num AccVGPR=0 */ +/* Num SGPR =101 */ + +/******************************************/ +/* Optimizations and Config: */ +/******************************************/ +/* ThreadTile= 2 x 2 */ +/* SubGroup= 16 x 16 */ +/* VectorWidthA=-1 */ +/* VectorWidthB=-1 */ +/* GlobalReadVectorWidthA=1, GlobalReadVectorWidthB=1 */ +/* DirectToLdsA=1 */ +/* DirectToLdsB=1 */ +/* UseSgprForGRO=0 */ +.amdgpu_metadata +--- +custom.config: + InternalSupportParams: + KernArgsVersion: 2 +amdhsa.version: + - 1 + - 1 +amdhsa.kernels: + - .name: Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_WG16_16_2 + .symbol: 'Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_WG16_16_2.kd' + .language: OpenCL C + .language_version: + - 2 + - 0 + .args: + - .name: Gemm info + .size: 4 + .offset: 0 + .value_kind: by_value + .value_type: u32 + - .name: kernel info0 + .size: 4 + .offset: 4 + .value_kind: by_value + .value_type: u32 + - .name: kernel info1 + .size: 4 + 
.offset: 8 + .value_kind: by_value + .value_type: u32 + - .name: numWG + .size: 4 + .offset: 12 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree0 + .size: 4 + .offset: 16 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree1 + .size: 4 + .offset: 20 + .value_kind: by_value + .value_type: u32 + - .name: SizesFree2 + .size: 4 + .offset: 24 + .value_kind: by_value + .value_type: u32 + - .name: SizesSum0 + .size: 4 + .offset: 28 + .value_kind: by_value + .value_type: u32 + - .name: D + .size: 8 + .offset: 32 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: C + .size: 8 + .offset: 40 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: A + .size: 8 + .offset: 48 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: B + .size: 8 + .offset: 56 + .value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: strideD0 + .size: 4 + .offset: 64 + .value_kind: by_value + .value_type: u32 + - .name: strideD1 + .size: 4 + .offset: 68 + .value_kind: by_value + .value_type: u32 + - .name: strideC0 + .size: 4 + .offset: 72 + .value_kind: by_value + .value_type: u32 + - .name: strideC1 + .size: 4 + .offset: 76 + .value_kind: by_value + .value_type: u32 + - .name: strideA0 + .size: 4 + .offset: 80 + .value_kind: by_value + .value_type: u32 + - .name: strideA1 + .size: 4 + .offset: 84 + .value_kind: by_value + .value_type: u32 + - .name: strideB0 + .size: 4 + .offset: 88 + .value_kind: by_value + .value_type: u32 + - .name: strideB1 + .size: 4 + .offset: 92 + .value_kind: by_value + .value_type: u32 + - .name: alpha + .size: 4 + .offset: 96 + .value_kind: by_value + .value_type: f32 + - .name: beta + .size: 4 + .offset: 100 + .value_kind: by_value + .value_type: f32 + - .name: AddressDbg + .size: 8 + .offset: 104 + .value_kind: global_buffer + .value_type: struct + .address_space: generic + - .name: dstD + .size: 8 + .offset: 112 + 
.value_kind: global_buffer + .value_type: f16 + .address_space: generic + - .name: Synchronizer + .size: 8 + .offset: 120 + .value_kind: global_buffer + .value_type: f32 + .address_space: generic + - .name: GSUSync + .size: 4 + .offset: 128 + .value_kind: by_value + .value_type: u32 + .group_segment_fixed_size: 65536 + .kernarg_segment_align: 8 + .kernarg_segment_size: 136 + .max_flat_workgroup_size: 512 + .private_segment_fixed_size: 0 + .sgpr_count: 101 + .sgpr_spill_count: 0 + .vgpr_count: 90 + .vgpr_spill_count: 0 + .wavefront_size: 64 +... +.end_amdgpu_metadata +Cijk_Alik_Bljk_HHS_BH_UserArgs_MT32x32x128_SN_K1_PGR4_TT2_2_WG16_16_2: +label_ASM_Start: /// Main body of the asm kernel +/* VGPR index symbols, referenced below as v[vgprName+offset] */ +.set vgprValuC, 0 +.set vgprValuA_X0_I0, 4 +.set vgprValuA_X1_I0, 4 /* same base index as vgprValuA_X0_I0 */ +.set vgprValuB_X0_I0, 36 +.set vgprValuB_X1_I0, 36 /* same base index as vgprValuB_X0_I0 */ + +//user define +.set vgprTemp0, 68 +.set vgprTemp1, 69 +.set vgprTemp2, 70 +.set vgprTemp3, 71 +.set vgprGlobalWriteOffsetD, 72 +.set vgprLocalReadAddrA, 88 +.set vgprLocalReadAddrB, 91 +.set vgprKeepSgprValue, 79 +.set vgprSerial, 80 +.set vgprAddressDbg, 81 //debugbuffer +.set vgprDebugTmp, 83 //debugbuffer +.set vgprGlobalReadOffsetA, 84 +.set vgprGlobalReadOffsetB, 86 + + +.set BufferLimit, 0xffffffff //0xffffffff +.set BufferOOB, 0x80000000 +.set Srd127_96, 0x00020000 +.set laneSrdA0, 0 +.set laneSrdA1, 1 +.set laneSrdA2, 2 +.set laneAddressB0, 4 +.set laneAddressB1, 5 +.set laneAddressC0, 6 +.set laneAddressC1, 7 +.set laneAddressD0, 8 +.set laneAddressD1, 9 +.set laneSizesFree1, 12 +.set laneWorkGroup1, 13 +.set laneStrideD, 14 +.set laneLoopCnt, 15 + +.set laneAddressWSA0, 16 +.set laneAddressWSA1, 17 +.set laneAddressWSA2, 18 +.set laneAddressWSA3, 19 +.set laneAddressWSB0, 20 +.set laneAddressWSB1, 21 +.set laneAddressWSB2, 22 +.set laneAddressWSB3, 23 + +.set laneAddressDout0, 24 +.set laneAddressDout1, 25 +.set laneAddressSync0, 26 +.set laneAddressSync1, 27 +.set laneNumGroup, 28 +.set laneIncA, 29 +.set laneIncB, 30 +.set laneWorkGroup1, 31 /* NOTE(review): laneWorkGroup1 was already set to 13 above and this later .set reassigns it to 31, confirm which index is intended */ + 
+.set laneSrdB0, 32 +.set laneSrdB1, 33 +.set laneSrdB2, 34 +.set laneldsWrA, 35 +.set laneldsWrB, 36 +.set lanelimitA0, 37 +.set lanelimitA1, 38 +.set lanelimitB0, 39 +.set lanelimitB1, 40 +.set laneAddressWSD0, 41 +.set laneAddressWSD1, 42 +.set laneAddressWSD2, 43 +.set laneAddressWSD3, 44 + + +//Args define +.set sgprKernArgAddress, 0 +.set sgprWorkGroup0, 2 +.set sgprWorkGroup1, 3 +.set sgprWorkGroup2, 4 +.set sgprArgType, 5 +.set sgprGSUSumIdx, 6 +.set sgprNumWorkGroups1, 7 +.set sgprGSULog2BpeC, 8 +.set sgprGSULog2BpeD, 9 +.set sgprStaggerU, 10 +.set sgprLoopCounterL, 11 +.set sgprGemmCount, 12 +.set sgprGSU, 13 +.set sgprWGM, 14 +.set sgprNumWorkGroups0, 15 +.set sgprSrdD, 16 +.set sgprSrdC, 20 +/* Indices 24..45 below must stay in step with the hard-coded s_load_dwordx16 s[24:39], s_load_dwordx4 s[40:43] and s_load_dwordx2 s[44:45] kernarg loads in the prologue */ +.set sgprSizesFree, 24 +.set sgprSizesSum, 27 +.set sgprAddressD, 28 +.set sgprAddressC, 30 +.set sgprAddressA, 32 +.set sgprAddressB, 34 +.set sgprStridesD, 36 +.set sgprStridesC, 38 +.set sgprStridesA, 40 +.set sgprStridesB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 45 +.set sgprWGMBuffer, 46 +.set sgprAddressDbg, 48 // defines the debug buffer address (loaded as a 2-SGPR pair by s_load_dwordx2 in the prologue) + + +//user define +.set sgprWorkGroup0Ori, 50 +.set sgprLoopCntCommon, 51 +.set sgprSrdA, 52 +.set sgprSrdB, 56 +.set sgprShadowLimitA, 60 +.set sgprShadowLimitB, 62 +//.set sgprStaggerUIter, 49 +//.set sgprWrapUA, 62 +//.set sgprWrapUB, 64 +.set sgprGlobalReadIncsA, 66 +.set sgprGlobalReadIncsB, 67 +.set sgprPackKForV0, 68 +.set sgprPackKForV1, 69 +.set sgprPackKForV2, 70 +.set sgprPackKForV3, 71 +.set sgprLocalWriteAddrA, 72 +.set sgprLocalWriteAddrB, 73 +.set sgprWaveID, 74 +.set sgprLDSMask, 75 +.set sgprLoopforPfIter, 76 + +.set sgprLDSWriteIter, 78 +.set sgprTemp0, 80 +.set sgprTemp1, 81 +.set sgprTemp2, 82 +.set sgprTemp3, 83 +.set sgprTensor2dSizeA, 84 +.set sgprTensor2dSizeB, 86 +.set sgprWaveID_M, 88 +.set sgprWaveID_N, 89 +.set sgprLocalWriteAddrAori, 90 +.set sgprLocalWriteAddrBori, 91 +.set sgprGlWaveID, 92 +.set sgprD_MEdge, 93 +.set sgprTemp4, 96 +.set sgprTemp5, 97 +.set sgprTemp6, 98 +.set sgprTemp7, 99 +.set 
sgprStrideStructOffset, 100 +.set sgprStructStrideA, 76 /* NOTE(review): index 76 is also sgprLoopforPfIter, confirm the two are never live at the same time */ +.set sgprStructStrideB, 77 + + +/* MMAC_16x16_part0_0: eight back-to-back v_mmac_f32_16x16x16_f16 steps that all accumulate into v[vgprValuC+0 .. vgprValuC+3]. Step k (k = 0..7) reads the register pairs vgprValuA_X0_I0+2k..2k+1 and vgprValuB_X0_I0+2k..2k+1. s_setprio 1 is issued after the first step and s_setprio 0 after the last. Going by the mnemonic each step is presumably a 16x16x16 f16 to f32 matrix multiply-accumulate, so 8 steps would cover the DEPTHU of 128 set below, TODO confirm. */ +.macro MMAC_16x16_part0_0 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+1] v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+1] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=0 +s_setprio 1 /* wave priority 1 for the remaining steps */ +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+2:vgprValuA_X0_I0+3] v[vgprValuB_X0_I0+2:vgprValuB_X0_I0+3] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=1 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+5] v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+5] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=2 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+6:vgprValuA_X0_I0+7] v[vgprValuB_X0_I0+6:vgprValuB_X0_I0+7] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=3 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+9] v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+9] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=4 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+10:vgprValuA_X0_I0+11] v[vgprValuB_X0_I0+10:vgprValuB_X0_I0+11] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=5 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+13] v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+13] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=6 +v_mmac_f32_16x16x16_f16 v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3] v[vgprValuA_X0_I0+14:vgprValuA_X0_I0+15] v[vgprValuB_X0_I0+14:vgprValuB_X0_I0+15] v[vgprValuC+0+0:vgprValuC+0+1:vgprValuC+0+2:vgprValuC+0+3]// step k=7 +s_setprio 0 /* back to wave priority 0 */ +.endm + +.set MT0, 32 +.set MT1, 32 +//.set LDS_B_OFFSET, 
4096 +//.set LDS_BLK_OFFSET, 8192 +//.set LDS_BLK_OFFSET_64Kmasked, 8192 +.set LDS_B_OFFSET, 8192 +.set LDS_BLK_OFFSET, 16384 +.set LDS_BLK_OFFSET_64Kmasked, 16384 +.set LOG2BPE, 1 +.set BPE, 2 +.set Log2BpeDest, 1 +.set LOG2BpeCompute, 2 +.set DEPTHU, 128 +.set LOG2DEPTHU, 7 +.set PFTLOOPS, 4 +.set GLWAVES, 4 +.set LOG2GLWAVES, 2 +.set MperWAVE, 16 +.set NperWAVE, 16 + +/* Load num of Gemms */ +s_load_dword s[sgprGemmCount], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 + +/* Load GSU data */ +s_load_dword s[sgprGSU], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4 + +/* Load WGM data */ +s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 + +s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 16 // Shift common args +s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0 +s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 /* s24..s39 are the indices from sgprSizesFree up to sgprStridesC+1 */ +s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 /* s40..s43 are sgprStridesA and sgprStridesB */ +s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 /* s44 is sgprAlpha, s45 is sgprBeta */ +s_load_dwordx2 s[sgprAddressDbg:sgprAddressDbg+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 +s_waitcnt lgkmcnt(0) +s_mov_b32 s[sgprWorkGroup0Ori], s[sgprWorkGroup0] + +s_and_b32 s[sgprStaggerU], s[sgprGSU], 0xffff0000 // Restore StaggerU related vars +s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10 +s_and_b32 s[sgprGSU], s[sgprGSU], 0xffff // Restore GSUConfig and GSU +v_mov_b32 v[vgprSerial], v0 +v_readfirstlane_b32 s[sgprWaveID], v[vgprSerial] +s_lshr_b32 s[sgprWaveID], s[sgprWaveID], 6 +s_and_b32 s[sgprGlWaveID], s[sgprWaveID], GLWAVES-1 +s_and_b32 s[sgprWaveID_M], s[sgprWaveID], 1 +s_lshr_b32 s[sgprWaveID_N], s[sgprWaveID], 1 +s_mov_b32 s[sgprLDSMask], 0x10000 + + +.set debug_buffer, 1 /* non-zero enables the .if debug_buffer dump blocks below */ +.if debug_buffer // compute the debug-buffer address offset of each workgroup +v_mov_b32 v4, s[sgprWorkGroup0Ori] // v1=wg1*nwg0+wg0 +v_lshlrev_b32 v5, 0x8, v4 // v1 = v1 * 256 // this is the thread total, adjust it when the number of waves changes +v_mul_lo_u32 v6, 2, v5 /* v6 = v5 * 2, so the workgroup index is scaled by 512 overall */ +v_add_u32 v7, v6, v[vgprSerial] // 
v1=tid+NT*(wg1*nwg0+wg0)=serial +v_mul_lo_u32 v8, 0x40, v7 // v1=serial*nipt*4 +v_mov_b32 v2, 0 // +v_mov_b32 v3, s[sgprAddressDbg+1] // v3=AddressD1 +v_add_co_u32 v[vgprAddressDbg], vcc, s[sgprAddressDbg], v8 // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +v_addc_co_u32 v[vgprAddressDbg+1], vcc, v3, v2, vcc // v[vgprAddressDbg]=AddrD* + serial*nipt*4 +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], v[vgprSerial] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup0] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], s[sgprWorkGroup1] +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif + +.if debug_buffer +v_mov_b32 v[vgprDebugTmp], 0xaaaa +flat_store_dword v[vgprAddressDbg:vgprAddressDbg+1], v[vgprDebugTmp] // debug dump store +v_add_u32 v[vgprAddressDbg], v[vgprAddressDbg], 0x4 // debug dump inc +.endif +/******************************************/ +/* Compute GroupID */ +/******************************************/ + +v_mov_b32 v8, MT0 // set MT0 into sgpr +v_mov_b32 v7, s[sgprSizesFree+0] // set Free0 size +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_mov_b32 v8, MT1 // set MT1 into sgpr +v_mov_b32 v7, s[sgprSizesFree+1] // set Free1 size 
+v_readfirstlane_b32 s[sgprNumWorkGroups0], v6 // set back to numWorkGroup0 +v_cvt_f32_u32 v6, v8 // v6 = ceil(v7 / v8) +v_rcp_iflag_f32 v6, v6 // v6 = ceil(v7 / v8) +v_cvt_f32_u32 v9, v7 // v6 = ceil(v7 / v8) +v_mul_f32 v6, v6, v9 // v6 = ceil(v7 / v8) +v_cvt_u32_f32 v6, v6 // v6 = ceil(v7 / v8) +v_mul_u32_u24 v9, v6, v8 // v6 = ceil(v7 / v8) +v_sub_u32 v9, v7, v9 // v6 = ceil(v7 / v8) +v_cmp_ne_u32 vcc, v9, 0 // v6 = ceil(v7 / v8) +v_addc_co_u32 v6, vcc, v6, 0, vcc // ceil +v_readfirstlane_b32 s[sgprNumWorkGroups1], v6 // set back to numWorkGroup1 + +/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */ +/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */ +s_mul_i32 s78, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] +s_and_b32 s79, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s78, s78, s79 +v_cvt_f32_u32 v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / s78 +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s78 +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s78 +v_mul_u32_u24 v7, v6, s78 // s78 = s[sgprWorkGroup0] / s78 +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s78 +v_cmpx_eq_u32 exec, v7, s78 // s78 = s[sgprWorkGroup0] / s78 +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s78 +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s78 +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup2], s78 +/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */ +s_mul_i32 s78, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] +s_mul_i32 s78, s78, s[sgprWorkGroup2] +s_mul_i32 s78, s78, s79 +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 +/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */ +v_cvt_f32_u32 v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_rcp_iflag_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_f32_u32 v7, s[sgprWorkGroup0] // s78 = s[sgprWorkGroup0] / 
s[sgprNumWorkGroups0] +v_mul_f32 v6, v6, v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cvt_u32_f32 v6, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_sub_u32 v7, s[sgprWorkGroup0], v7 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0] // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_add_u32 v6, 1, v6 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +s_mov_b64 exec, -1 // s78 = s[sgprWorkGroup0] / s[sgprNumWorkGroups0] +v_readfirstlane_b32 s78, v6 // quotient +s_mov_b32 s[sgprWorkGroup1], s78 // wg1 = quotient +/* wg0 = idxWG01 - wg1 * numWG0 */ +s_mul_i32 s78, s[sgprWorkGroup1], s[sgprNumWorkGroups0] +s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s78 // wg0 = remainder + +/******************************************/ +/* WorkGroup Mapping */ +/******************************************/ + +/* graWorkGroup mapping */ + +s_and_b32 s80, s[sgprGSU], 0x3fff // Restore GSU (low 14 bits of the packed GSU word) +s_cmp_eq_u32 s80, 1 // GSU == 1 ? +s_cbranch_scc1 label_GSU // branch if GSU == 1 +// GSU-not-WGMapRR :nwg1 = (size1J + MT1J - 1) / MT1J; +s_and_b32 s80, s[sgprGSU], 0x4000 // SCC = (GSUWGMRR == 1) ? 
+s_cbranch_scc1 label_GSUWGMRR // branch if GSUWGMRR == 1 +s_and_b32 s80, s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v6, s80 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_mul_u32_u24 v7, v6, s80 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_sub_u32 v7, s[sgprWorkGroup1], v7 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_cmpx_eq_u32 exec, v7, s80 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_mov_b32 v7, 0 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] % s80 +s_mov_b64 exec, -1 // s[sgprWorkGroup1] = s[sgprWorkGroup1] / s80 +v_readfirstlane_b32 s[sgprWorkGroup1], v6 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx], v7 // remainder +s_branch label_GSUWGMRR_End +label_GSUWGMRR: +v_cvt_f32_u32 v6, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_rcp_iflag_f32 v6, v6 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_f32 v6, v6, v7 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cvt_u32_f32 v6, v6 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_sub_u32 v7, s[sgprWorkGroup1], v7 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups1] // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_add_u32 v6, 1, v6 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s[sgprWorkGroup1] % 
s[sgprNumWorkGroups1] +s_mov_b64 exec, -1 // s[sgprGSUSumIdx] = s[sgprWorkGroup1] / s[sgprNumWorkGroups1] +v_readfirstlane_b32 s[sgprGSUSumIdx], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +label_GSUWGMRR_End: +s_mov_b32 s[sgprGSULog2BpeC], Log2BpeDest +s_mov_b32 s[sgprGSULog2BpeD], LOG2BpeCompute +s_branch label_GSU_End +label_GSU: +s_mov_b64 s[sgprGSUSumIdx:sgprGSUSumIdx+1], 0 // Set GSUSumIdx to 0 +s_mov_b32 s[sgprGSULog2BpeC], Log2BpeDest +s_mov_b32 s[sgprGSULog2BpeD], Log2BpeDest +label_GSU_End: + + +s_cmp_le_i32 s[sgprWGM], 1 +s_cbranch_scc1 label_WGM +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprWorkGroup1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprWorkGroup1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s76, v6 // quotient +s_mul_i32 s77, s76, s[sgprWGM] // quotient * non-magic divisor +s_sub_u32 s77, s[sgprWorkGroup1], s77 // WorkGroup1=remainder +s_mul_i32 s77, s77, s[sgprNumWorkGroups0] // (wg1 % WGM)*NumWorkGroups0 +s_add_u32 s77, s77, s[sgprWorkGroup0] // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0 +v_cvt_f32_u32 v6, s[sgprWGM] // WGM +v_rcp_iflag_f32 v6, v6 // WGM +v_cvt_f32_u32 v7, s[sgprNumWorkGroups1] // WGM +v_mul_f32 v6, v6, v7 // WGM +v_cvt_u32_f32 v6, v6 // WGM +v_mul_u32_u24 v7, v6, s[sgprWGM] // WGM +v_sub_u32 v7, s[sgprNumWorkGroups1], v7 // WGM +v_cmpx_eq_u32 exec, v7, s[sgprWGM] // WGM +v_add_u32 v6, 1, v6 // WGM +s_mov_b64 exec, -1 // WGM +v_readfirstlane_b32 s78, v6 // quotient +s_mul_i32 s79, s[sgprWGM], s78 // quotient * non-magic divisor +s_sub_u32 s79, s[sgprNumWorkGroups1], s79 // NumWorkGroups1=remainder +s_cmp_eq_u32 s79, 0 // remainder == 0 ? +s_cmov_b32 s79, s[sgprWGM] // remainder = WGM if remainder == 0 +s_cmp_ge_u32 s76, s78 // blockId >= numFullBlocks ? 
+s_cselect_b32 s78, s79, s[sgprWGM] +v_cvt_f32_u32 v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_rcp_iflag_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_f32_u32 v7, s77 // s[sgprWorkGroup0] = s77 / s78 +v_mul_f32 v6, v6, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cvt_u32_f32 v6, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mul_u32_u24 v7, v6, s78 // s[sgprWorkGroup0] = s77 / s78 +v_sub_u32 v7, s77, v7 // s[sgprWorkGroup0] = s77 / s78 +v_cmpx_eq_u32 exec, v7, s78 // s[sgprWorkGroup0] = s77 / s78 +v_add_u32 v6, 1, v6 // s[sgprWorkGroup0] = s77 / s78 +v_mov_b32 v7, 0 // s[sgprWorkGroup1] = s77 % s78 +s_mov_b64 exec, -1 // s[sgprWorkGroup0] = s77 / s78 +v_readfirstlane_b32 s[sgprWorkGroup0], v6 // quotient +v_readfirstlane_b32 s[sgprWorkGroup1], v7 // remainder +s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s78 // quotient * non-magic divisor +s_sub_u32 s[sgprWorkGroup1], s77, s[sgprWorkGroup1] // WorkGroup1=remainder +s_mul_i32 s76, s76, s[sgprWGM] // blockId * WGM +s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s76 // wg1 += blockId * WGM +label_WGM: + +/******************************************/ +/* Generate Global A parameters ... */ +/******************************************/ +.set COALESCE_THREAD_A, 8 //x4 load +.set LOG2_COALESCE_THREAD_A, 3 //x4 load + + +v_and_b32 v[vgprTemp0], 255, v[vgprSerial] // +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp0] //wave ID +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 //set to 0~63 +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] //0..0 1..1 2..2 3..3 ... 7..7 +v_lshlrev_b32 v[vgprTemp0], 2, v[vgprTemp0] //0..0 4..4 8..8 12..12 ... 
28..28 +v_add_u32 v0, v[vgprTemp1], v[vgprTemp0] + +v_and_b32 v1, 15, v[vgprSerial] //COALESCE_THREAD +v_lshlrev_b32 v1, 4, v1 + + +/* global read addresses: Perp offsets*/ +v_mov_b32 v2 v0 + +/* global read addresses: Coalesce offsets*/ +v_mov_b32 v3 v1 + +/* global read addresses: final offsets*/ + +v_mul_lo_u32 v[vgprTemp0], s[sgprStridesA], v2 +v_lshlrev_b32 v[vgprTemp0], 1, v[vgprTemp0] +v_add_co_u32 v[vgprGlobalReadOffsetA+0], vcc, v3, v[vgprTemp0] + +v_add_u32 v2, 16, v2 +v_mul_lo_u32 v[vgprTemp0], s[sgprStridesA], v2 +v_lshlrev_b32 v[vgprTemp0], 1, v[vgprTemp0] +v_add_co_u32 v[vgprGlobalReadOffsetA+1], vcc, v3, v[vgprTemp0] + + + +/******************************************/ +/* Generate Global B parameters ... */ +/******************************************/ +.set COALESCE_THREAD_B, 8 //x4 load +.set LOG2_COALESCE_THREAD_B, 3 //x4 load + + +v_and_b32 v[vgprTemp0], 255, v[vgprSerial] // +v_lshrrev_b32 v[vgprTemp1], 6, v[vgprTemp0] //wave ID +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 //set to 0~63 +v_lshrrev_b32 v[vgprTemp0], 4, v[vgprTemp0] //0..0 1..1 2..2 3..3 ... 7..7 +v_lshlrev_b32 v[vgprTemp0], 2, v[vgprTemp0] //0..0 4..4 8..8 12..12 ... 
28..28 +v_add_u32 v0, v[vgprTemp1], v[vgprTemp0] // v0 = wave id + 4 * (lane / 16) + +v_and_b32 v1, 15, v[vgprSerial] //COALESCE_THREAD: lane index within a group of 16 +v_lshlrev_b32 v1, 4, v1 // * 16 + + + +/* global read addresses: Perp offsets*/ +v_mov_b32 v2 v0 + +/* global read addresses: Coalesce offsets*/ +v_mov_b32 v3 v1 + +/* global read addresses: final offsets*/ + +v_mul_lo_u32 v[vgprTemp0], s[sgprStridesB], v2 // perp index * StridesB +v_lshlrev_b32 v[vgprTemp0], 1, v[vgprTemp0] // * 2 +v_add_co_u32 v[vgprGlobalReadOffsetB+0], vcc, v3, v[vgprTemp0] + +v_add_u32 v2, 16, v2 // second load uses perp index + 16 +v_mul_lo_u32 v[vgprTemp0], s[sgprStridesB], v2 +v_lshlrev_b32 v[vgprTemp0], 1, v[vgprTemp0] +v_add_co_u32 v[vgprGlobalReadOffsetB+1], vcc, v3, v[vgprTemp0] + +//v_add_u32 v2, 8, v2 +//v_mul_lo_u32 v[vgprTemp0], s[sgprStridesB], v2 +//v_lshlrev_b32 v[vgprTemp0], 1, v[vgprTemp0] +//v_add_co_u32 v[vgprGlobalReadOffsetB+2], vcc, v3, v[vgprTemp0] + +//v_add_u32 v2, 8, v2 +//v_mul_lo_u32 v[vgprTemp0], s[sgprStridesB], v2 +//v_lshlrev_b32 v[vgprTemp0], 1, v[vgprTemp0] +//v_add_co_u32 v[vgprGlobalReadOffsetB+3], vcc, v3, v[vgprTemp0] + + +/******************************************/ +/* Generate Srd A/B parameters ... */ +/******************************************/ + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup0], MT0 // WorkGroup[00] * MT (high 32 bits) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 // WorkGroup[00] * MT (low 32 bits) +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], s[sgprStridesA+0] // tlu=0, scaled tile-offset by stride: high half is taken from the low product word in Temp0, which must be read before the next instruction overwrites it +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesA+0] // tlu=0, scaled tile-offset by stride (low 32 bits) + +// GSU processing +s_and_b32 s[sgprTemp2], s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cbranch_scc1 label_GSUC_A // branch if GSUC == 1 +s_mul_hi_u32 s[sgprTemp3], DEPTHU, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s[sgprTemp2], DEPTHU, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_A_End +label_GSUC_A: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], LOG2DEPTHU // s[LoopCounterL] = s[sgprSizesSum] / DEPTHU +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v0, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v0, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v1, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v0, v0, v1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v0, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v1, v0, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v1, s[sgprLoopCounterL], v1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v1, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v0, 1, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v1, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_readfirstlane_b32 s[sgprLoopCounterL], v0 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v1 // remainder +s_mul_i32 s[sgprTemp3], s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s[sgprTemp2], 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s[sgprTemp3], s[sgprTemp3], s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder +s_mul_i32 s[sgprTemp2], s[sgprTemp2], s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < 
numIterPerWgRemainder +s_cselect_b32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp3] // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s[sgprTemp3], s[sgprTemp2], DEPTHU // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s[sgprTemp2], s[sgprTemp2], DEPTHU // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_A_End: +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp2] // accum GsuOffset term to tilestart +s_addc_u32 s[sgprTemp1], s[sgprTemp1], s[sgprTemp3] // accum GsuOffset term to tilestart + + +s_mul_hi_u32 s[sgprTensor2dSizeA+1], s[sgprStridesA+0], s[sgprSizesFree+0] +s_mul_i32 s[sgprTensor2dSizeA+0], s[sgprStridesA+0], s[sgprSizesFree+0] +s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprWorkGroup2], s[sgprStridesA+1] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprWorkGroup2], s[sgprStridesA+1] // Stride*WG +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp2] // accum wg term to tilestart +s_addc_u32 s[sgprTemp1], s[sgprTemp1], s[sgprTemp3] // accum wg term to tilestart +// Set limit to use bytes fp16 = 1 +s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], LOG2BPE + + +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32 +s_lshl_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], LOG2BPE +s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdA+3], Srd127_96 // Set bits 127_96 in SRD + + +s_and_b32 s80, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s80, s80, DEPTHU*BPE // GSU*DEPTHU*BPE +s_and_b32 s81, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? 
+s_cselect_b32 s[sgprGlobalReadIncsA+0], DEPTHU*BPE, s80 //depthU*BPE when GSUC is set, otherwise GSU*depthU*BPE + + +s_mul_hi_u32 s[sgprTemp1], s[sgprWorkGroup1], MT1 // WorkGroup[01] * MT (high 32 bits) +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup1], MT1 // WorkGroup[01] * MT (low 32 bits) +s_mul_hi_u32 s[sgprTemp1], s[sgprTemp0], s[sgprStridesB+0] // tlu=0, scaled tile-offset by stride: high half is taken from the low product word in Temp0, which must be read before the next instruction overwrites it +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesB+0] // tlu=0, scaled tile-offset by stride (low 32 bits) + +// GSU processing +s_and_b32 s[sgprTemp2], s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cbranch_scc1 label_GSUC_B // branch if GSUC == 1 +s_mul_hi_u32 s[sgprTemp3], DEPTHU, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_mul_i32 s[sgprTemp2], DEPTHU, s[sgprGSUSumIdx] // gsuOffset = DepthU*GSUSumIdx +s_branch label_GSUC_B_End +label_GSUC_B: +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], LOG2DEPTHU // s[LoopCounterL] = s[sgprSizesSum] / DEPTHU +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v0, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v0, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v1, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v0, v0, v1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v0, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v1, v0, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v1, s[sgprLoopCounterL], v1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v1, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v0, 1, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v1, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] 
+v_readfirstlane_b32 s[sgprLoopCounterL], v0 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v1 // remainder +s_mul_i32 s[sgprTemp3], s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx +s_add_u32 s[sgprTemp2], 1, s[sgprLoopCounterL] // quotient+1 +s_add_u32 s[sgprTemp3], s[sgprTemp3], s[sgprGSUSumIdx+1] // quotient*GSUSumIdx+remainder +s_mul_i32 s[sgprTemp2], s[sgprTemp2], s[sgprGSUSumIdx] // (quotient+1)*GSUSumIdx +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cselect_b32 s[sgprTemp2], s[sgprTemp2], s[sgprTemp3] // (quotient+1)*GSUSumIdx if needed +s_mul_hi_u32 s[sgprTemp3], s[sgprTemp2], DEPTHU // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +s_mul_i32 s[sgprTemp2], s[sgprTemp2], DEPTHU // gsuOffset = DepthU*accumulatedNumOfLoopCounterL +label_GSUC_B_End: +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp2] // accum GsuOffset term to tilestart +s_addc_u32 s[sgprTemp1], s[sgprTemp1], s[sgprTemp3] // accum GsuOffset term to tilestart + +s_mul_hi_u32 s[sgprTensor2dSizeB+1], s[sgprStridesB+0], s[sgprSizesFree+1] +s_mul_i32 s[sgprTensor2dSizeB+0], s[sgprStridesB+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s[sgprTemp0] +s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], s[sgprTemp1] +s_mul_hi_u32 s[sgprTemp3], s[sgprWorkGroup2], s[sgprStridesB+1] // Stride*WG +s_mul_i32 s[sgprTemp2], s[sgprWorkGroup2], s[sgprStridesB+1] // Stride*WG +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp2] // accum wg term to tilestart +s_addc_u32 s[sgprTemp1], s[sgprTemp1], s[sgprTemp3] // accum wg term to tilestart +// Set limit to use bytes fp16 = 1 +s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], LOG2BPE + +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? 
+s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32 +s_lshl_b64 s[sgprTemp0:sgprTemp1], s[sgprTemp0:sgprTemp1], LOG2BPE +s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s[sgprTemp0] // SRD base = Address+ tileStart0 +s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s[sgprTemp1] // SRD base = Address+ tileStart1 +s_mov_b32 s[sgprSrdB+3], Srd127_96 // Set bits 127_96 in SRD + +s_and_b32 s80, s[sgprGSU], 0x3fff // Restore GSU +s_mul_i32 s80, s80, DEPTHU*BPE // GSU*DEPTHU*BPE +s_and_b32 s81, s[sgprGSU], 0x8000 // SCC = (GSUC == 1) ? +s_cselect_b32 s[sgprGlobalReadIncsB+0], DEPTHU*BPE, s80 //depthU*PEB + +/******************************************/ +/* Generate LDS A parameters ... */ +/******************************************/ +.set WAVE_LDS_OFFSET_A, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_A //x4 load +.set LOADxWAVES_K_A, 64/COALESCE_THREAD_A*GLWAVES +.set LOADxWAVES_K_A_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_A, WAVE_LDS_OFFSET_A*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrA], s[sgprGlWaveID], WAVE_LDS_OFFSET_A+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], s[sgprTemp0] +s_mov_b32 s[sgprLocalWriteAddrAori], s[sgprLocalWriteAddrA] + +s_cmp_ge_i32 s[sgprWaveID], 4 +s_cbranch_scc1 skip_MacWaveALdsR + +//get lds read addrA dwordx4 load, DepthU=128, ldspad=0 ldswarp=32 +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 3, v[vgprTemp0] // (0~3) 4 waves as a group +v_lshlrev_b32 v1, 10, v0 // (0~3) * 1024 +v_lshlrev_b32 v2, 5, v0 // (0~3) * 32, per wave 8 lds bank warp +v_add_u32 v0, v1, v2 // add +//v_add_u32 v0, v1, 0 // add + +v_and_b32 v1, 15, v[vgprTemp0] // 0~15 0~15 0~15 0~15 +v_lshrrev_b32 v1, 2, v1 // 0000 1111 ~ 3333 +v_lshlrev_b32 v1, 8, v1 // (0000 1111 ~ 3333) * 128 +v_add_u32 v0, 
v0, v1 // add + +v_lshrrev_b32 v1, 4, v[vgprTemp0] // (0~63) / 16 +v_lshlrev_b32 v1, 4, v1 // (00..00 11..11 ~ 33..33) * 16 +v_add_u32 v[vgprLocalReadAddrA], v0, v1 // add + +//lds wrap A + +v_mov_b32 v0, 0 +v_mov_b32 v1, 0 +s_mov_b32 s[sgprTemp0], 1024 +v_writelane_b32 v0, s[sgprTemp0], 47 +v_writelane_b32 v0, s[sgprTemp0], 63 + +v_writelane_b32 v1, s[sgprTemp0], 14 +v_writelane_b32 v1, s[sgprTemp0], 15 +v_writelane_b32 v1, s[sgprTemp0], 30 +v_writelane_b32 v1, s[sgprTemp0], 31 +v_writelane_b32 v1, s[sgprTemp0], 45 +v_writelane_b32 v1, s[sgprTemp0], 46 +v_writelane_b32 v1, s[sgprTemp0], 47 +v_writelane_b32 v1, s[sgprTemp0], 61 +v_writelane_b32 v1, s[sgprTemp0], 62 +v_writelane_b32 v1, s[sgprTemp0], 63 + +v_sub_u32 v[vgprLocalReadAddrA+1], v[vgprLocalReadAddrA], v0 // add +v_sub_u32 v[vgprLocalReadAddrA+2], v[vgprLocalReadAddrA], v1 // add + +s_lshl_b32 s[sgprTemp1], s[sgprWaveID_M], 12 +v_add_u32 v[vgprLocalReadAddrA], v[vgprLocalReadAddrA], s[sgprTemp1] +v_add_u32 v[vgprLocalReadAddrA+1], v[vgprLocalReadAddrA+1], s[sgprTemp1] +v_add_u32 v[vgprLocalReadAddrA+2], v[vgprLocalReadAddrA+2], s[sgprTemp1] + +skip_MacWaveALdsR: +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Generate LDS B parameters ... 
*/ +/******************************************/ +.set WAVE_LDS_OFFSET_B, 64*16 //x4 load +.set WAVE_LDS_OFFSET, WAVE_LDS_OFFSET_B //x4 load +.set LOADxWAVES_K_B, 64/COALESCE_THREAD_B*GLWAVES +.set LOADxWAVES_K_B_LOG2, 5 +.set LOADxWAVES_LDS_OFFSET_B, WAVE_LDS_OFFSET_B*GLWAVES + +//Wrap Lds +s_mul_i32 s[sgprLocalWriteAddrB], s[sgprGlWaveID], WAVE_LDS_OFFSET_B+0 // can add lds pad +s_and_b32 s[sgprTemp0], s[sgprGlWaveID], 3 +s_mul_i32 s[sgprTemp0], s[sgprTemp0], 2 +//s_mul_i32 s[sgprTemp0], s[sgprTemp0], 0 +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], 16 +s_or_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], s[sgprTemp0] +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_B_OFFSET +s_mov_b32 s[sgprLocalWriteAddrBori], s[sgprLocalWriteAddrB] + +s_cmp_ge_i32 s[sgprWaveID], 4 +s_cbranch_scc1 skip_MacWaveBLdsR + +//get lds read addrB dwordx4 load, DepthU=128, ldspad=0 ldswarp=32 +v_and_b32 v[vgprTemp0], v[vgprSerial], 63 +v_and_b32 v0, 3, v[vgprTemp0] // (0~3) 4 waves as a group +v_lshlrev_b32 v1, 10, v0 // (0~3) * 1024 +v_lshlrev_b32 v2, 5, v0 // (0~3) * 32, per wave 8 lds bank warp +v_add_u32 v0, v1, v2 // add +//v_add_u32 v0, v1, 0 // add + +v_and_b32 v1, 15, v[vgprTemp0] // 0~15 0~15 0~15 0~15 +v_lshrrev_b32 v1, 2, v1 // 0000 1111 ~ 3333 +v_lshlrev_b32 v1, 8, v1 // (0000 1111 ~ 3333) * 128 +v_add_u32 v0, v0, v1 // add + +v_lshrrev_b32 v1, 4, v[vgprTemp0] // (0~63) / 16 +v_lshlrev_b32 v1, 4, v1 // (00..00 11..11 ~ 33..33) * 16 +v_add_u32 v[vgprLocalReadAddrB], v0, v1 // add + +//lds wrap B + +v_mov_b32 v0, 0 +v_mov_b32 v1, 0 +s_mov_b32 s[sgprTemp0], 1024 +v_writelane_b32 v0, s[sgprTemp0], 47 +v_writelane_b32 v0, s[sgprTemp0], 63 + +v_writelane_b32 v1, s[sgprTemp0], 14 +v_writelane_b32 v1, s[sgprTemp0], 15 +v_writelane_b32 v1, s[sgprTemp0], 30 +v_writelane_b32 v1, s[sgprTemp0], 31 +v_writelane_b32 v1, s[sgprTemp0], 45 +v_writelane_b32 v1, s[sgprTemp0], 46 +v_writelane_b32 v1, s[sgprTemp0], 47 +v_writelane_b32 v1, s[sgprTemp0], 61 +v_writelane_b32 v1, 
s[sgprTemp0], 62 +v_writelane_b32 v1, s[sgprTemp0], 63 + +v_add_u32 v[vgprLocalReadAddrB], LDS_B_OFFSET, v[vgprLocalReadAddrB] +v_sub_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB], v0 // add +v_sub_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB], v1 // add + +s_lshl_b32 s[sgprTemp1], s[sgprWaveID_N], 12 +v_add_u32 v[vgprLocalReadAddrB], v[vgprLocalReadAddrB], s[sgprTemp1] +v_add_u32 v[vgprLocalReadAddrB+1], v[vgprLocalReadAddrB+1], s[sgprTemp1] +v_add_u32 v[vgprLocalReadAddrB+2], v[vgprLocalReadAddrB+2], s[sgprTemp1] + + +skip_MacWaveBLdsR: +.set WAVE_LDS_OFFSET, UNDEF //x4 load + +/******************************************/ +/* Keep Sgpr Values for use later ... */ +/******************************************/ + +//store sgprs to keep value +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+0], laneSrdA0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+1], laneSrdA1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdA+2], laneSrdA2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+0], laneSrdB0 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+1], laneSrdB1 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprSrdB+2], laneSrdB2 +v_writelane_b32 v[vgprKeepSgprValue], s[sgprGlobalReadIncsA], laneIncA +v_writelane_b32 v[vgprKeepSgprValue], s[sgprGlobalReadIncsB], laneIncB + +/******************************************/ +/* Define Global Load... 
*/ +/******************************************/ + +.macro GLOBAL_LOADA offset:req +/* GLOBAL_LOADA: issue the two A-tile buffer loads that carry the lds modifier; m0 is LocalWriteAddrA plus the macro argument for the first load and 4096 more for the second */ +s_add_u32 m0, s[sgprLocalWriteAddrA], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 lds +s_add_u32 m0, m0, 4096 +buffer_load_dwordx4 v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 lds + +.endm + +.macro GLOBAL_LOADB offset:req +/* GLOBAL_LOADB: same as GLOBAL_LOADA but for B (LocalWriteAddrB, GlobalReadOffsetB, SrdB) */ +s_add_u32 m0, s[sgprLocalWriteAddrB], \offset +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 lds +s_add_u32 m0, m0, 4096 +buffer_load_dwordx4 v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 lds + +.endm + +/******************************************/ +/* Define Global Load address Increase... */ +/******************************************/ + +.macro GLOBAL_INCA +/* GLOBAL_INCA: advance the 64-bit SrdA base by GlobalReadIncsA, shrink the 64-bit ShadowLimitA by the same amount, then refresh SrdA+2 from it; overwrites Temp0, Temp1 and SCC */ +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsA+0] // increment, low word +s_mov_b32 s[sgprTemp1], 0 // increment, high word +s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s[sgprTemp0] // base += increment (low) +s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s[sgprTemp1] // base += increment (high, with carry) +s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s[sgprTemp0] // limit -= increment (low) +s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s[sgprTemp1] // limit -= increment (high, with borrow) +s_cmp_eq_u32 s[sgprShadowLimitA+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32, otherwise use BufferLimit + +.endm + +.macro GLOBAL_INCB +/* GLOBAL_INCB: same as GLOBAL_INCA but for B (GlobalReadIncsB, SrdB, ShadowLimitB); overwrites Temp0, Temp1 and SCC */ +s_mov_b32 s[sgprTemp0], s[sgprGlobalReadIncsB+0] // increment, low word +s_mov_b32 s[sgprTemp1], 0 // increment, high word +s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s[sgprTemp0] // base += increment (low) +s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s[sgprTemp1] // base += increment (high, with carry) +s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s[sgprTemp0] // limit -= increment (low) +s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s[sgprTemp1] // limit -= increment (high, with borrow) +s_cmp_eq_u32 s[sgprShadowLimitB+1], 0 // are we within 2^32? +s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32, otherwise use BufferLimit + +.endm + +/******************************************/ +/* Define LDS Load... 
*/ +/******************************************/ + +.macro LDS_LOADA off:req + +ds_read_b128 v[vgprValuA_X0_I0+ 0 +0:vgprValuA_X0_I0+ 0 +3], v[vgprLocalReadAddrA + 0] offset:0 + 0 + \off +ds_read_b128 v[vgprValuA_X0_I0+ 0 +4:vgprValuA_X0_I0+ 0 +7], v[vgprLocalReadAddrA + 0] offset:64 + 0 + \off +ds_read_b128 v[vgprValuA_X0_I0+ 0 +8:vgprValuA_X0_I0+ 0 +11], v[vgprLocalReadAddrA + 1] offset:128 + 0 + \off +ds_read_b128 v[vgprValuA_X0_I0+ 0 +12:vgprValuA_X0_I0+ 0 +15], v[vgprLocalReadAddrA + 2] offset:192 + 0 + \off + +.endm + +.macro LDS_LOADB off:req + +ds_read_b128 v[vgprValuB_X0_I0+ 0 +0:vgprValuB_X0_I0+ 0 +3], v[vgprLocalReadAddrB + 0] offset:0 + 0 + \off +ds_read_b128 v[vgprValuB_X0_I0+ 0 +4:vgprValuB_X0_I0+ 0 +7], v[vgprLocalReadAddrB + 0] offset:64 + 0 + \off +ds_read_b128 v[vgprValuB_X0_I0+ 0 +8:vgprValuB_X0_I0+ 0 +11], v[vgprLocalReadAddrB + 1] offset:128 + 0 + \off +ds_read_b128 v[vgprValuB_X0_I0+ 0 +12:vgprValuB_X0_I0+ 0 +15], v[vgprLocalReadAddrB + 2] offset:192 + 0 + \off + +.endm + +s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], LOG2DEPTHU // s[sgprLoopCounterL] = s[sgprSizesSum+0] / DEPTHU +s_and_b32 s80, s[sgprGSU], 0x3fff // Restore GSU +s_cmp_eq_u32 s80, 1 // GSU == 1 ? 
+s_cbranch_scc1 label_GSU_1 // branch if GSU == 1 +s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff // Restore GSU +v_cvt_f32_u32 v0, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_rcp_iflag_f32 v0, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_f32_u32 v1, s[sgprLoopCounterL] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_f32 v0, v0, v1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cvt_u32_f32 v0, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mul_u32_u24 v1, v0, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_sub_u32 v1, s[sgprLoopCounterL], v1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_cmpx_eq_u32 exec, v1, s[sgprGSUSumIdx+1] // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_add_u32 v0, 1, v0 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_mov_b32 v1, 0 // s[sgprGSUSumIdx+1] = s[sgprLoopCounterL] % s[sgprGSUSumIdx+1] +s_mov_b64 exec, -1 // s[sgprLoopCounterL] = s[sgprLoopCounterL] / s[sgprGSUSumIdx+1] +v_readfirstlane_b32 s[sgprLoopCounterL], v0 // quotient +v_readfirstlane_b32 s[sgprGSUSumIdx+1], v1 // remainder +s_add_u32 s80, 1, s[sgprLoopCounterL] // tmp<-numIterMyWg+1 +s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1] // gsuSumIdx < numIterPerWgRemainder +s_cmov_b32 s[sgprLoopCounterL], s80 // numIterMyWg++ if needed +label_GSU_1: + + +/******************************************/ +/* Use Global Load Wave process ... 
*/ +/******************************************/ +s_cmp_lt_u32 s[sgprWaveID], 4 +s_cbranch_scc1 SkipGL +s_min_u32 s[sgprLoopCntCommon], 3, s[sgprLoopCounterL] + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 PreLoad_END +PreLoad_BEGIN: + +GLOBAL_LOADA LDS_BLK_OFFSET*0 +GLOBAL_LOADB LDS_BLK_OFFSET*0 +GLOBAL_INCA +GLOBAL_INCB + +s_add_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 PreLoad_BEGIN +PreLoad_END: +s_waitcnt vmcnt(4) + +s_cmp_gt_i32 s[sgprLoopCounterL], 3 +s_cbranch_scc1 skip_vmcnt0 +s_waitcnt vmcnt(0) +skip_vmcnt0: +s_barrier +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoop_END +MainLoop_BEGIN: + +s_cmp_le_i32 s[sgprLoopCntCommon], 3 +s_cmov_b32 s[sgprSrdA+2], 0 +s_cmov_b32 s[sgprGlobalReadIncsA+0], 0 + +s_cmov_b32 s[sgprSrdB+2], 0 +s_cmov_b32 s[sgprGlobalReadIncsB+0], 0 + +GLOBAL_LOADA LDS_BLK_OFFSET*0 +GLOBAL_LOADB LDS_BLK_OFFSET*0 + +GLOBAL_INCA +GLOBAL_INCB + +s_add_u32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrA], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrAori], s[sgprLDSMask] +s_cmp_ge_u32 s[sgprLocalWriteAddrA], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrA], s[sgprLocalWriteAddrAori] + +s_add_u32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrB], LDS_BLK_OFFSET_64Kmasked +s_add_u32 s[sgprTemp0], s[sgprLocalWriteAddrBori], s[sgprLDSMask] +s_cmp_ge_u32 
s[sgprLocalWriteAddrB], s[sgprTemp0] +s_cmov_b32 s[sgprLocalWriteAddrB], s[sgprLocalWriteAddrBori] + +s_waitcnt vmcnt(4) +s_barrier + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoop_BEGIN +MainLoop_END: +s_endpgm +SkipGL: + +/******************************************/ +/* GL END */ +/******************************************/ + +/******************************************/ +/* Generate SrcD ... */ +/******************************************/ + +s_and_b32 s[sgprTemp2], s[sgprGSU], 0x3fff +s_cmp_gt_u32 s[sgprTemp2], 1 +s_cselect_b32 s[sgprTemp2], 2, 1 +s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1] // init SRD base address (upper) + other fields +s_mov_b32 s[sgprSrdC+0], s[sgprAddressC+0] // init SRD base address (lower) +s_mov_b32 s[sgprSrdC+1], s[sgprAddressC+1] // init SRD base address (upper) + other fields +s_mul_i32 s[sgprTemp0], s[sgprWorkGroup0], MT0 +s_mul_i32 s[sgprTemp1], s[sgprWorkGroup1], MT1 +s_mul_i32 s[sgprTemp1], s[sgprTemp1], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp2] +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 + +#s_mul_i32 s[sgprTemp0], s[sgprSizesFree+0], s[sgprSizesFree+1] +s_mul_i32 s[sgprTemp0], s[sgprStridesD+0], s[sgprSizesFree+1] +s_sub_u32 s[sgprTemp1], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprSrdD+2], s[sgprTemp1], s[sgprTemp2] +s_mov_b32 s[sgprSrdD+3], Srd127_96 // Set bits 127_96 in post-loop SRD +s_lshl_b32 s[sgprSrdC+2], s[sgprTemp1], s[sgprTemp2] +s_mov_b32 s[sgprSrdC+3], Srd127_96 // Set bits 127_96 in post-loop SRD + + +s_and_b32 s[sgprTemp3], s[sgprGSU], 0x3fff +s_cmp_gt_u32 s[sgprTemp3], 1 +s_cselect_b32 s[sgprTemp3], 2, 1 +v_and_b32 v[vgprTemp0], 
v[vgprSerial], 63 +v_lshrrev_b32 v[vgprTemp1], 4, v[vgprTemp0] +v_and_b32 v[vgprTemp2], 15, v[vgprTemp0] +v_lshlrev_b32 v[vgprTemp2], s[sgprTemp3], v[vgprTemp2] +v_mul_lo_u32 v[vgprTemp1], v[vgprTemp1], s[sgprStridesD] +v_lshlrev_b32 v[vgprTemp1], s[sgprTemp3], v[vgprTemp1] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], v[vgprTemp2] +s_mul_i32 s[sgprTemp1], s[sgprWaveID_M], MperWAVE +s_mul_i32 s[sgprTemp0], s[sgprWaveID_N], NperWAVE +s_mul_i32 s[sgprTemp0], s[sgprTemp0], s[sgprStridesD] +s_add_u32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp1] +s_lshl_b32 s[sgprTemp0], s[sgprTemp0], s[sgprTemp3] +v_add_u32 v[vgprGlobalWriteOffsetD], v[vgprGlobalWriteOffsetD], s[sgprTemp0] + +/******************************************/ +/* Init ValueC ... */ +/******************************************/ + +v_mov_b32 v[vgprValuC+0], 0x0 +v_mov_b32 v[vgprValuC+1], 0x0 +v_mov_b32 v[vgprValuC+2], 0x0 +v_mov_b32 v[vgprValuC+3], 0x0 + +/******************************************/ +/* LoopCounter == 0, Skip to tail/last loop ... */ +/******************************************/ +s_cmp_le_i32 s[sgprLoopCounterL], 0 +s_cbranch_scc1 TAIL_LOOP +s_barrier +LDS_LOADA 0 +LDS_LOADB 0 + + +/******************************************/ +/* Main Loop Process ... 
*/ +/******************************************/ +s_mov_b32 s[sgprLoopCntCommon], s[sgprLoopCounterL] + +/* Unrolled Loop 1/4 - Begin */ + + +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3_END +MainLoopBeginW0_3_BEGIN: + +s_waitcnt lgkmcnt(0) +MMAC_16x16_part0_0 +s_barrier + +LDS_LOADA 16384 +LDS_LOADB 16384 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/* Unrolled Loop 2/4 - Begin */ + +s_waitcnt lgkmcnt(0) +MMAC_16x16_part0_0 +s_barrier + +LDS_LOADA 32768 +LDS_LOADB 32768 +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/* Unrolled Loop 3/4 - Begin */ + +s_waitcnt lgkmcnt(0) +MMAC_16x16_part0_0 +s_barrier + +LDS_LOADA 49152 +LDS_LOADB 49152 + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_le_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 TAIL_LOOP + +/* Unrolled Loop 4/4 - Begin */ + +s_waitcnt lgkmcnt(0) +MMAC_16x16_part0_0 +s_barrier + +LDS_LOADA 0 +LDS_LOADB 0 + + +s_sub_u32 s[sgprLoopCntCommon], s[sgprLoopCntCommon], 1 +s_cmp_gt_i32 s[sgprLoopCntCommon], 0 +s_cbranch_scc1 MainLoopBeginW0_3_BEGIN +MainLoopBeginW0_3_END: + +/******************************************/ +/* Tail Loop Process ... */ +/******************************************/ +TAIL_LOOP: + + +/******************************************/ +/* Global Write Process ... 
*/ +/******************************************/ + +s_cmp_eq_u32 s[sgprBeta], 0 +s_cbranch_scc1 Beta_eqcase +s_endpgm +s_branch Beta_EndSwitch +Beta_eqcase: + +s_mul_i32 s[sgprD_MEdge], s[sgprWorkGroup0], MT0 +s_sub_u32 s[sgprD_MEdge], s[sgprSizesFree+0], s[sgprD_MEdge] +s_lshl_b32 s[sgprD_MEdge], s[sgprD_MEdge], 1 +s_min_u32 s[sgprD_MEdge], s[sgprD_MEdge], MT0*2 +v_and_b32 v[vgprTemp2], v[vgprSerial], 15 +s_mul_i32 s[sgprTemp1], s[sgprWaveID_M], MperWAVE +v_add_u32 v[vgprTemp2], v[vgprTemp2], s[sgprTemp1] +v_lshlrev_b32 v[vgprTemp2], 1, v[vgprTemp2] +v_mov_b32 v[vgprTemp3], v[vgprTemp2] //store inittial addr +v_mov_b32 v[vgprTemp0], v[vgprGlobalWriteOffsetD] //store inittial addr + +.set Nvoff, 0 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 32, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 32, v[vgprTemp2] + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 4*2 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 1 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M 
offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 32, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 32, v[vgprTemp2] + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 4*2 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 2 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 32, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 32, v[vgprTemp2] + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 4*2 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], 
s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +.set Nvoff, 3 +v_mov_b32 v[vgprTemp1], v[vgprTemp0] //v[vgprTemp1] <- v[vgprGlobalWriteOffsetD] with strideD +v_mov_b32 v[vgprTemp2], v[vgprTemp3] //v[vgprTemp2] is just M offset without strideD for compare edge +v_cmp_ge_u32 s[sgprTemp2:sgprTemp2+1], v[vgprTemp2], s[sgprD_MEdge] +v_cndmask_b32 v[vgprGlobalWriteOffsetD], v[vgprTemp1], -1, s[sgprTemp2:sgprTemp2+1] +v_mul_f32 v[vgprValuC+Nvoff+0], s[sgprAlpha], v[vgprValuC+Nvoff+0] +v_cvt_f16_f32 v[vgprValuC+Nvoff+0], v[vgprValuC+Nvoff+0] +buffer_store_short v[vgprValuC+Nvoff+0], v[vgprGlobalWriteOffsetD], s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D +v_add_u32 v[vgprTemp1], 32, v[vgprTemp1] +v_add_u32 v[vgprTemp2], 32, v[vgprTemp2] + + +.set Nvoff, UNDEF +s_mul_i32 s[sgprTemp0], s[sgprStridesD], 4*2 +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 +s_subb_u32 s[sgprSrdD+2], s[sgprSrdD+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdD+2], 0 +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s[sgprTemp0] +s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 +s_subb_u32 s[sgprSrdC+2], s[sgprSrdC+2], s[sgprTemp0] +s_cmov_b32 s[sgprSrdC+2], 0 + +Beta_EndSwitch: +s_endpgm diff --git a/clean.sh b/clean.sh new file mode 100644 index 0000000000000000000000000000000000000000..22e26c3bf969e27c892703e76096c57b437d2c87 --- /dev/null +++ b/clean.sh @@ -0,0 +1,7 @@ +#! /bin/bash +pip uninstall -y aiter +rm -rf aiter_meta/ +rm -rf aiter/jit/aiter_.so +rm -rf aiter/jit/module_* +rm -rf aiter/jit/build + diff --git a/csrc/ck_batched_gemm_a8w8/README.md b/csrc/ck_batched_gemm_a8w8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ee6d3c7bb4769a9be0fc34a4f35f801640942b9a --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/README.md @@ -0,0 +1,24 @@ +# CK batched_gemm a8w8 tune + +1. Install aiter: +`cd $aiter_path` +`python3 setup.py develop` + +2. 
Add GEMM shapes in `aiter/configs/a8w8_untuned_batched_gemm.csv` + |**B**|**M**|**N**|**K**| + |-----|-----|-----|-----| + |16 |128 |1536 |7168 | + +3. Start tuning: +Run the following cmd to start tuning. Please wait a few minutes, as it will build batched_gemm_a8w8_tune via jit: +`python3 csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_batched_gemm.csv -o aiter/configs/a8w8_tuned_batched_gemm.csv` +You can find the results of the tuning in `aiter/configs/a8w8_tuned_batched_gemm.csv`. + +4. Build tuned kernels and test: +To test the performance, modify the test instance in `op_tests/test_batched_gemm_a8w8.py` and run it. Please wait a few minutes, as it will build the batched_gemm_a8w8 tuned kernels listed in `aiter/configs/a8w8_tuned_batched_gemm.csv` via jit: +`python3 op_tests/test_batched_gemm_a8w8.py` +If you have built batched_gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_a8w8.py`. It will rebuild kernels from `aiter/configs/a8w8_tuned_batched_gemm.csv`. + + +## More +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after tuning finishes. This can take a lot of time and is not recommended. 
\ No newline at end of file diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu new file mode 100644 index 0000000000000000000000000000000000000000..55e1b538cb3de4383a73076557584f7a0ea8e883 --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8.cu @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: MIT + + +#include "batched_gemm_a8w8_common.cuh" +#include "batched_gemm_a8w8_manifest.h" +#include "batched_gemm_a8w8_lookup.h" +#include + +using BatchedRowwiseKernel = std::function< + torch::Tensor(torch::Tensor &, torch::Tensor &, + torch::Tensor &, torch::Tensor &, + torch::Tensor &, std::optional, + int)>; + +// Define a custom hash function for std::tuple +struct IntTupleHash +{ + size_t operator()(const std::tuple &t) const + { + auto hash1 = std::hash{}(std::get<0>(t)); + auto hash2 = std::hash{}(std::get<1>(t)); + auto hash3 = std::hash{}(std::get<2>(t)); + auto hash4 = std::hash{}(std::get<3>(t)); + return hash1 ^ hash2 ^ hash3 ^ hash4; + } +}; + +// For certain high priority shapes, we directly use the best kernel rather +// than use heuristics. +using BatchedRowwiseKernelMap = std::unordered_map< + std::tuple, + BatchedRowwiseKernel, + IntTupleHash>; + +template +BatchedRowwiseKernel batched_rowwise_heuristic_dispatch(int B, int M, int N, int K) +{ + // Apply shape heuristics to find a suitable kernel implementation. + if (M < 64 && N < 2048 && K < 2048) + { + // Kernel that generally works well on small shapes. + return a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2; + } + else if (M < 64 && K < 2048) + { + // Kernel that works well for small batch size and small K. + return a8w8_batched_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2; + } + else if (M < 64 && N < 2048) + { + // Kernel that works well for small batch size and small N. 
+ return a8w8_batched_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2; + } + else if (M < 64 && N > 2048 && K > 2048) + { + // Kernel that works well for small M but larger N and K. + return a8w8_batched_rowwise_64x16x16x256_16x16_1x1_16x4x1_16x4x1_1x16x1x4_4x4x1_1x1_intrawave_v1; + } + else if (M < 64) + { + // Fallback to generic small batch kernel if we cant find a good match. + return a8w8_batched_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2; + /* } else if (((M < 512 && K < 8192) || (N <= 2048 && K <= 8192) || (K <= 2048 && N <= 8192)) && K >= 1024) { + // Kernel that is optimized for larger batch sizes but otherwise small + // tensors. + return a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5; */ + } + else if (K < 1024) + { + // Special case for small K. + return a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1; + } + else if (M < 1024) + { + // Kernel for generic medium batch sizes. + return a8w8_batched_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3; + } + else if (M >= 1024 && N >= 1024 && K >= 1024) + { + // Kernel for very large gemm + // return a8w8_batched_rowwise_256x256x256x128_16x16_8x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3; + return a8w8_batched_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1; + } + else + { + // Fallback large kernel. + return a8w8_batched_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3; + } +} + +// Helper function to return the next largest power of 2 +static constexpr int nextPow2(unsigned int num) +{ + if (num <= 1) + return 1; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +BatchedRowwiseKernel batched_rowwise_dispatch(int B, int M, int N, int K) +{ + // For a given shape, either find the best kernel via lookup or heuristic. 
+ // For many small M shapes, we bucket them to the next largest kernel. + // This is fine since kernels are padded anyway. + + static const auto lookup = [] + { + if constexpr (std::is_same_v) { + return BatchedRowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType,F16)}; + } else if constexpr (std::is_same_v) { + return BatchedRowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType,B16)}; + } else { + static_assert(false, "batched_rowwise_dispatch used with unsupported dtype!"); + } + }(); + + // First check if this shape(M,N,K) is available in the direct lookup. + auto it = lookup.find({B, M, N, K}); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + + int padded_m = M; + if (M > 1 && M <= 16) + { + padded_m = 16; + } + else if (M <= 16384) + { + padded_m = nextPow2(M); + } + else if (M <= 20480) + { + padded_m = 20480; + } + // Second check if this shape(padded_m,N,K) is available in the direct lookup. + it = lookup.find({B, padded_m, N, K}); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + // Otherwise, use heuristics. 
+ return batched_rowwise_heuristic_dispatch(B, M, N, K); +} + +torch::Tensor batched_gemm_a8w8( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int splitK) +{ + TORCH_CHECK(XQ.dtype() == at::ScalarType::Char && XQ.dtype() == WQ.dtype(), + "Weights and activations should both be int8!"); + TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), + "Scales should have the same dtype!"); + if (bias != std::nullopt) + TORCH_CHECK(bias.value().dtype() == Y.dtype(), + "Out and bias should have the same dtype!"); + + int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = XQ.size(2); + int KBatch = 1; + + if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::Half) + { + batched_rowwise_dispatch(B, M, N, K)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::BFloat16) + { + batched_rowwise_dispatch(B, M, N, K)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else if (Y.dtype() == at::ScalarType::Half) + { + batched_rowwise_dispatch(B, M, N, K)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else if (Y.dtype() == at::ScalarType::BFloat16) + { + batched_rowwise_dispatch(B, M, N, K)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else + { + TORCH_CHECK(false, "Unsupported scales/output dtype!"); + } + return Y; +} diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_common.py b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_common.py new file mode 100644 index 0000000000000000000000000000000000000000..2aa01ce8347d47b2db3278a65dc9a9ca6f4c4f9f --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_common.py @@ -0,0 +1,1782 @@ +# SPDX-License-Identifier: MIT +from dataclasses import dataclass + + +@dataclass +class kernelInstance: + BLOCK_SIZE: int + MPerBLOCK: int + NPerBLOCK: int + KPerBLOCK: int + WAVE_TILE_M: int + WAVE_TILE_N: int + WAVE_MAP_M: int + 
WAVE_MAP_N: int + ABLOCK_TRANSFER: list[int] + BBLOCK_TRANSFER: list[int] + CBLOCK_TRANSFER: list[int] + CBLOCK_SPV: list[int] + CSHUFFLE_MX_PER_WAVE_PERSHUFFLE: int + CSHUFFLE_NX_PER_WAVE_PERSHUFFLE: int + LOOP_SCHED: str + PIPELINE_VERSION: int + + @property + def name(self) -> str: + return ("_").join( + [ + "a8w8_batched_rowwise", + ("x").join( + map( + lambda x: str(x), + [ + self.BLOCK_SIZE, + self.MPerBLOCK, + self.NPerBLOCK, + self.KPerBLOCK, + ], + ) + ), + ("x").join(map(lambda x: str(x), [self.WAVE_TILE_M, self.WAVE_TILE_N])), + ("x").join(map(lambda x: str(x), [self.WAVE_MAP_M, self.WAVE_MAP_N])), + ("x").join(map(lambda x: str(x), self.ABLOCK_TRANSFER)), + ("x").join(map(lambda x: str(x), self.BBLOCK_TRANSFER)), + ("x").join(map(lambda x: str(x), self.CBLOCK_TRANSFER)), + ("x").join(map(lambda x: str(x), self.CBLOCK_SPV)), + ("x").join( + map( + lambda x: str(x), + [ + self.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + self.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + ], + ) + ), + self.LOOP_SCHED.lower(), + f"v{self.PIPELINE_VERSION}", + ] + ) + + +kernels_list = { + # id: kernel: BLOCK_SIZE| MPerBLOCK| NPerBLOCK| KPerBLOCK| WAVE_TILE_M| WAVE_TILE_N| WAVE_MAP_M| WAVE_MAP_N| ABLOCK_TRANSFER| BBLOCK_TRANSFER| CBLOCK_TRANSFER| CBLOCK_SPV| CSHUFFLE_MX| CSHUFFLE_NX| LOOP_SCHED| PIPELINE_VERSION + 0: kernelInstance( + 256, + 256, + 256, + 64, + 32, + 32, + 4, + 4, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 1: kernelInstance( + 256, + 256, + 256, + 128, + 32, + 32, + 4, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 2: kernelInstance( + 256, + 256, + 224, + 128, + 32, + 32, + 2, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 3: kernelInstance( + 256, + 256, + 192, + 128, + 32, + 32, + 4, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 4: kernelInstance( + 256, + 256, + 
160, + 128, + 32, + 32, + 2, + 5, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 5: kernelInstance( + 256, + 256, + 128, + 128, + 32, + 32, + 4, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 6: kernelInstance( + 256, + 256, + 96, + 128, + 32, + 32, + 2, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 7: kernelInstance( + 256, + 256, + 64, + 128, + 32, + 32, + 4, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 8: kernelInstance( + 256, + 128, + 256, + 128, + 32, + 32, + 2, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 9: kernelInstance( + 256, + 128, + 224, + 128, + 32, + 32, + 1, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 10: kernelInstance( + 256, + 128, + 192, + 128, + 32, + 32, + 2, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 11: kernelInstance( + 256, + 128, + 160, + 128, + 32, + 32, + 1, + 5, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 12: kernelInstance( + 256, + 128, + 128, + 256, + 32, + 32, + 2, + 2, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 13: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 14: kernelInstance( + 256, + 128, + 96, + 256, + 32, + 32, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 15: kernelInstance( + 256, + 128, + 64, + 256, + 32, + 32, + 2, + 1, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 16: kernelInstance( + 256, + 64, + 256, + 128, + 32, + 32, 
+ 1, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 17: kernelInstance( + 256, + 64, + 224, + 128, + 16, + 16, + 2, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 18: kernelInstance( + 256, + 64, + 192, + 256, + 32, + 32, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 19: kernelInstance( + 256, + 64, + 192, + 128, + 32, + 32, + 1, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 20: kernelInstance( + 256, + 64, + 160, + 256, + 16, + 16, + 2, + 5, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 21: kernelInstance( + 256, + 64, + 128, + 256, + 32, + 32, + 1, + 2, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 22: kernelInstance( + 256, + 64, + 96, + 256, + 16, + 16, + 2, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 23: kernelInstance( + 256, + 64, + 64, + 512, + 32, + 32, + 1, + 1, + [32, 8, 1], + [32, 8, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 24: kernelInstance( + 256, + 32, + 256, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 25: kernelInstance( + 256, + 32, + 224, + 256, + 16, + 16, + 1, + 7, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 26: kernelInstance( + 256, + 32, + 192, + 256, + 16, + 16, + 1, + 6, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 27: kernelInstance( + 256, + 32, + 160, + 256, + 16, + 16, + 1, + 5, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 28: kernelInstance( + 256, + 32, + 128, + 256, + 32, + 32, + 1, + 1, + [16, 
16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 29: kernelInstance( + 256, + 32, + 96, + 256, + 16, + 16, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 30: kernelInstance( + 256, + 32, + 64, + 512, + 16, + 16, + 1, + 2, + [32, 8, 1], + [32, 8, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 31: kernelInstance( + 256, + 16, + 256, + 128, + 16, + 16, + 1, + 4, + [16, 16, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 32: kernelInstance( + 256, + 16, + 192, + 256, + 16, + 16, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 33: kernelInstance( + 256, + 16, + 128, + 256, + 16, + 16, + 1, + 2, + [16, 16, 1], + [16, 16, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 34: kernelInstance( + 256, + 16, + 64, + 512, + 16, + 16, + 1, + 1, + [32, 8, 1], + [32, 8, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 35: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 36: kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 37: kernelInstance( + 256, + 256, + 256, + 128, + 16, + 16, + 8, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 38: kernelInstance( + 256, + 256, + 256, + 64, + 16, + 16, + 8, + 8, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 39: kernelInstance( + 256, + 224, + 256, + 128, + 16, + 16, + 7, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 40: kernelInstance( + 256, + 256, + 224, + 128, + 16, + 16, + 8, + 7, + [8, 32, 1], + [8, 32, 
1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 41: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 5, + ), + 42: kernelInstance( + 256, + 128, + 256, + 64, + 32, + 32, + 2, + 4, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 43: kernelInstance( + 256, + 256, + 128, + 64, + 32, + 32, + 4, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 44: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 45: kernelInstance( + 256, + 128, + 64, + 128, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 46: kernelInstance( + 256, + 64, + 128, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 47: kernelInstance( + 256, + 64, + 64, + 128, + 32, + 32, + 1, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + # mem(Intrawave): Latency friendly + 48: kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 1, + ), + 49: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + 50: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + # mem(Intrawave): Memory friendly, Col + 51: kernelInstance( + 256, + 256, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 52: kernelInstance( + 256, + 256, + 16, + 128, + 
16, + 16, + 4, + 1, + [8, 32, 1], + [8, 16, 1], + [1, 32, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 53: kernelInstance( + 128, + 128, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 54: kernelInstance( + 128, + 128, + 16, + 128, + 16, + 16, + 4, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 55: kernelInstance( + 128, + 64, + 32, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 56: kernelInstance( + 128, + 64, + 16, + 128, + 16, + 16, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 57: kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 58: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [4, 16, 1], + [4, 16, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 59: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 60: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 61: kernelInstance( + 128, + 16, + 64, + 128, + 16, + 16, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 62: kernelInstance( + 128, + 32, + 64, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 2, + ), + 63: kernelInstance( + 128, + 16, + 128, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 64: kernelInstance( + 128, + 32, + 128, + 128, + 32, + 32, + 1, + 2, + [8, 16, 1], + [8, 16, 
1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 65: kernelInstance( + 256, + 16, + 256, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 66: kernelInstance( + 256, + 32, + 256, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 1, + "Intrawave", + 2, + ), + # mem(Interwave): Latency friendly + 67: kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 1, + ), + 68: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 1, + ), + 69: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 1, + ), + # mem(Interwave): Memory friendly, Col + 70: kernelInstance( + 256, + 256, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 71: kernelInstance( + 256, + 256, + 16, + 128, + 16, + 16, + 4, + 1, + [8, 32, 1], + [8, 16, 1], + [1, 32, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 72: kernelInstance( + 128, + 128, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 73: kernelInstance( + 128, + 128, + 16, + 128, + 16, + 16, + 4, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 74: kernelInstance( + 128, + 64, + 32, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 75: kernelInstance( + 128, + 64, + 16, + 128, + 16, + 16, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 76: kernelInstance( + 128, + 32, + 16, + 128, + 16, 
+ 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 77: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [4, 16, 1], + [4, 16, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 78: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 79: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 80: kernelInstance( + 128, + 16, + 64, + 128, + 16, + 16, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 81: kernelInstance( + 128, + 32, + 64, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 82: kernelInstance( + 128, + 16, + 128, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 83: kernelInstance( + 128, + 32, + 128, + 128, + 32, + 32, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 84: kernelInstance( + 256, + 16, + 256, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 85: kernelInstance( + 256, + 32, + 256, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), +} + + +default_kernels_dict = { + # ( M, N, K): kernel: BLOCK_SIZE| MPerBLOCK| NPerBLOCK| KPerBLOCK| WAVE_TILE_M| WAVE_TILE_N| WAVE_MAP_M| WAVE_MAP_N| ABLOCK_TRANSFER| BBLOCK_TRANSFER| CBLOCK_TRANSFER| CBLOCK_SPV| CSHUFFLE_MX| CSHUFFLE_NX| LOOP_SCHED|PIPELINE_VERSION + (-1): kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 
2, + ), + (-3): kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + (-4): kernelInstance( + 64, + 16, + 16, + 256, + 16, + 16, + 1, + 1, + [16, 4, 1], + [16, 4, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + (-5): kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + (-6): kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + (-7): kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + (-8): kernelInstance( + 256, + 256, + 128, + 64, + 32, + 32, + 4, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + (-9): kernelInstance( + 256, + 224, + 256, + 128, + 16, + 16, + 7, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + (-10): kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), +} diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.cu b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.cu new file mode 100644 index 0000000000000000000000000000000000000000..3adbdf4d58f53aa2612f6a6068ac3ebc16de1169 --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.cu @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: MIT + + +#include "batched_gemm_a8w8_common.cuh" +#include "batched_gemm_a8w8_manifest.h" +#include "batched_gemm_a8w8_lookup.h" +#include + +using BatchedRowwiseKernel = std::function< + torch::Tensor(torch::Tensor &, torch::Tensor &, + torch::Tensor &, torch::Tensor &, + torch::Tensor &, std::optional, + 
int)>; + +// For certain high priority shapes, we directly use the best kernel rather +// than use heuristics. +using BatchedRowwiseKernelMap = std::unordered_map< + int, + BatchedRowwiseKernel>; + +// Helper function to return the next largest power of 2 +static constexpr int nextPow2(unsigned int num) +{ + if (num <= 1) + return 1; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +BatchedRowwiseKernel batched_rowwise_dispatch(int id) +{ + // For a given shape, either find the best kernel via lookup or heuristic. + // For many small M shapes, we bucket them to the next largest kernel. + // This is fine since kernels are padded anyway. + + // First check if this shape is available in the direct lookup. + static const auto lookup = [] + { + if constexpr (std::is_same_v) { + return BatchedRowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType,F16)}; + } else if constexpr (std::is_same_v) { + return BatchedRowwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType,B16)}; + } else { + static_assert(false, "batched_rowwise_dispatch used with unsupported dtype!"); + } }(); + + TORCH_CHECK(id < lookup.size(), + "Kernel id " + std::to_string(id) +" is out of range!"); + auto it = lookup.find(id); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + // Otherwise, use heuristics. 
+ return lookup.find(0)->second; +} + + +torch::Tensor batched_gemm_a8w8_tune( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + int kernelId, + int splitK) +{ + TORCH_CHECK(XQ.dtype() == at::ScalarType::Char && XQ.dtype() == WQ.dtype(), + "Weights and activations should both be int8!"); + TORCH_CHECK( x_scale.dtype() == w_scale.dtype(), + "Scales should have the same dtype!"); + std::optional bias = std::nullopt; + + int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = XQ.size(2); + int KBatch = std::pow(2, splitK); + + // if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::Half) + // { + // batched_rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias); + // } + // else if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::BFloat16) + // { + // batched_rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias); + // } + // else if (Y.dtype() == at::ScalarType::Half) + // { + // batched_rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias); + // } + // else + if (Y.dtype() == at::ScalarType::BFloat16) + { + batched_rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else + { + TORCH_CHECK(false, "Unsupported scales/output dtype!"); + } + return Y; +} diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py new file mode 100644 index 0000000000000000000000000000000000000000..659979db1648cd7f032407cc0247938b07370f89 --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py @@ -0,0 +1,204 @@ +# SPDX-License-Identifier: MIT +import os +import aiter +import pandas as pd +import torch +import torch.nn.functional as F +from aiter import dtypes +from aiter.test_common import perftest +from batched_gemm_a8w8_common import kernels_list +import argparse + + +def checkClose(a, b, rtol=1e-3, atol=0.01): + isClose = 
torch.isclose(a, b, rtol=rtol, atol=atol) + mask = ~isClose + if isClose.all(): + return True + else: + percent = (a[mask]).numel() / a.numel() + if percent > 0.01: + return False + else: + return True + + +def run_torch(x, weight, x_scale, w_scale, bias=None, dtype=dtypes.bf16): + B = x.size(0) + M = x.size(1) + N = weight.size(1) + out = torch.empty(B, M, N, dtype=dtypes.bf16, device="cuda") + for b in range(B): + b_x = F.linear(x[b, :, :].to(dtypes.fp32), weight[b, :, :].to(dtypes.fp32)) + b_scale = torch.matmul(x_scale[b, :, :], w_scale[b, :, :]) + b_out = torch.mul(b_x, b_scale) + if bias is not None: + b_out = b_out.to(bias[b, :, :]) + bias[b, :, :] + out[b, :, :] = b_out + return out.to(dtype) + + +def get_untuned_batched_gemm_list(untuned_batched_gemm_file): + assert os.path.exists( + untuned_batched_gemm_file + ), f"Not exist a8w8_untuned_batched_gemm.csv file: {untuned_batched_gemm_file}" + untunedf = pd.read_csv(untuned_batched_gemm_file) + return untunedf + + +def get_tuned_batched_gemm_list(tuned_batched_gemm_file): + if os.path.exists(tuned_batched_gemm_file): + tunedf = pd.read_csv(tuned_batched_gemm_file) + else: + tunedf = pd.DataFrame( + columns=["B", "M", "N", "K", "kernelId", "splitK", "us", "kernelName"] + ) + return tunedf + + +@perftest() +def kernel_instance_test(x, weight, x_scale, w_scale, out, kernel_id, splitK=0): + aiter.batched_gemm_a8w8_tune(x, weight, x_scale, w_scale, out, kernel_id, splitK) + return out + + +def tune_batched_gemm(b, m, n, k, useSplitK=False): + dim = (b, m, n, k) + x = torch.randint(-20, 20, (b, m, k), dtype=dtypes.i8, device="cuda") + weight = torch.randint(-20, 20, (b, n, k), dtype=dtypes.i8, device="cuda") + x_scale = torch.rand([b, m, 1], dtype=dtypes.bf16, device="cuda") + w_scale = torch.rand([b, 1, n], dtype=dtypes.bf16, device="cuda") + out = torch.empty(b, m, n, dtype=dtypes.bf16, device="cuda") + + ref_out = run_torch(x, weight, x_scale, w_scale) + + print(f"*******************B:{b} X M:{m} X N:{n} X 
K:{k}**************************") + print(f"Start tuning a8w8 batched_gemm kernel for B:{b}, M:{m}, N:{n}, K{k}:") + kernels_num = len(kernels_list) + best_kernelConfig = (-1, 0) + best_time = -1 + for i in range(kernels_num): + kernel = kernels_list[i] + maxsplitK = ( + aiter.compute_batched_gemm_SplitK( + b, m, n, k, kernel.MPerBLOCK, kernel.NPerBLOCK, kernel.KPerBLOCK + ) + if useSplitK + else 0 + ) + for splitK in range(maxsplitK + 1): + try: + (out), avg_t = kernel_instance_test( + x, weight, x_scale, w_scale, out, i, splitK + ) + isClosed = checkClose(ref_out, out, rtol=1e-2, atol=0.01) + if isClosed: + print( + f"{str(dim):<20} kernelid:{i:<3d}\t avg: {avg_t:<8.2f} us, {kernel.name}, {splitK=}" + ) + if best_time < 0 or avg_t < best_time: + best_kernelConfig = (i, splitK) + best_time = avg_t + else: + print( + f"{str(dim):<20} kernelid:{i:<3d}\t No pass , {kernel.name}, {splitK=}" + ) + except RuntimeError as e: + print(f"error = {e}") + print( + f"{str(dim):<20} kernelid:{i:<3d}\t No support , {kernel.name}, {splitK=}" + ) + + best_kernelId, splitK = best_kernelConfig + if best_kernelConfig[0] == -1: + print(f"No kernel can be used for B{b}, M:{m}, N:{n}, K:{k}") + best_time = "nan" + else: + best_time = round(best_time, 4) + + print( + f"Tuning result for B:{b}, M:{m}, N:{n}, K:{k} is kernelId={best_kernelId} {kernels_list[best_kernelId].name} {splitK=}, {best_time}us" + ) + print(f"*******************B:{b} X M:{m} X N:{n} X K{k}**************************") + + return best_kernelId, splitK, best_time + + +def tune_batched_gemm_list(untunedf, tunedf, issorted=False, useSplitK=False): + for i in range(len(untunedf)): + B = untunedf.loc[i, "B"] + M = untunedf.loc[i, "M"] + N = untunedf.loc[i, "N"] + K = untunedf.loc[i, "K"] + + if tunedf[ + (tunedf["B"] == B) + & (tunedf["M"] == M) + & (tunedf["N"] == N) + & (tunedf["K"] == K) + ].empty: + kernelId, splitK, time = tune_batched_gemm(B, M, N, K, useSplitK) + kernelName = "None" if kernelId == -1 else 
kernels_list[kernelId].name + temp = pd.DataFrame( + { + "B": [B], + "M": [M], + "N": [N], + "K": [K], + "kernelId": [kernelId], + "splitK": [splitK], + "us": [time], + "kernelName": [kernelName], + } + ) + tunedf = pd.concat([tunedf, temp], ignore_index=True) + + else: + print(f"B:{B}, M:{M}, N:{N}, K{K} is in tuned batched_gemm, skip!!!") + print() + print() + if issorted: + tunedf = tunedf.sort_values(by=["B", "M", "N", "K"]) + print("Totall tuning result:") + print(tunedf) + return tunedf + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK batched_gemm a8w8 kernel", + ) + + parser.add_argument( + "-i", + "--untune_file", + default="aiter/configs/a8w8_untuned_batched_gemm.csv", + required=False, + help="input", + ) + + parser.add_argument( + "-o", + "--tune_file", + default="aiter/configs/a8w8_tuned_batched_gemm.csv", + required=False, + help="output: tuning result store this file", + ) + + parser.add_argument( + "-k", "--splitK", action="store_true", required=False, help="Use splitK kernels" + ) + + parser.add_argument( + "--sort", + action="store_true", + required=False, + help="Arranged according to the B M N K size", + ) + + args = parser.parse_args() + untunedf = get_untuned_batched_gemm_list(args.untune_file) + tunedf = get_tuned_batched_gemm_list(args.tune_file) + tunedf = tune_batched_gemm_list(untunedf, tunedf, args.sort, args.splitK) + tunedf.to_csv(args.tune_file, index=False) diff --git a/csrc/ck_batched_gemm_a8w8/gen_instances.py b/csrc/ck_batched_gemm_a8w8/gen_instances.py new file mode 100644 index 0000000000000000000000000000000000000000..c4117283c9b4efc6bd2a0f01ed0728383369c8c5 --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/gen_instances.py @@ -0,0 +1,318 @@ +# SPDX-License-Identifier: MIT +import os +from pathlib import Path +import pandas as pd +import argparse +import shutil +from batched_gemm_a8w8_common import kernelInstance, kernels_list, default_kernels_dict + + +class 
batched_gemm_a8w8_fwd_codegen: + def __init__(self, working_path, istune=False): + self.working_path = working_path + self.impl_path = os.path.join(working_path, "impl") + self.instances_path = os.path.join(working_path, "instances") + self.istune = istune + + def gen_instance(self, k: kernelInstance): + INSTANCE_IMPL = f"""// SPDX-License-Identifier: MIT + + +#include "batched_gemm_a8w8_common.cuh" + +template +torch::Tensor +{k.name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int KBatch) +{{{{ + // The smallest kernel we have available. Works well for memory bound shapes. + + // Check if this input needs to be padded. + int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = WQ.size(2); + bool pad = (M % {k.MPerBLOCK} != 0) || (N % {k.NPerBLOCK} != 0) || (K % ({k.KPerBLOCK} * KBatch) != 0); + if (pad) + {{{{ + // pad + {{INSTANCE_CONTENT_pad}} + // pad + }}}} + else + {{{{ + // no pad + {{INSTANCE_CONTENT_nopad}} + // no pad + }}}} +}}}} + +""" + INSTANCE_CONTENT_bias = f""" + {{{{ + using DeviceGemmInstance = DeviceGemmHelper< + DDataType, EDataType, + {k.BLOCK_SIZE}, + {k.MPerBLOCK}, + {k.NPerBLOCK}, + {k.KPerBLOCK}, + {k.WAVE_TILE_M}, + {k.WAVE_TILE_N}, + {k.WAVE_MAP_M}, + {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + ck::BlockGemmPipelineScheduler::{k.LOOP_SCHED}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. 
+ return batched_gemm_a8w8_rowwise_impl(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + }}}} +""" + INSTANCE_CONTENT_nobias = f"""using DeviceGemmInstance = DeviceGemmHelper< + DDataType, EDataType, + {k.BLOCK_SIZE}, + {k.MPerBLOCK}, + {k.NPerBLOCK}, + {k.KPerBLOCK}, + {k.WAVE_TILE_M}, + {k.WAVE_TILE_N}, + {k.WAVE_MAP_M}, + {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + ck::BlockGemmPipelineScheduler::{k.LOOP_SCHED}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. + return batched_gemm_a8w8_rowwise_impl(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); +""" + if self.istune: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="MNKPadding") + ), + INSTANCE_CONTENT_nopad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="Default") + ), + ) + else: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=INSTANCE_CONTENT_bias.format( + GemmSpec="MNKPadding" + ), + INSTANCE_CONTENT_nopad=INSTANCE_CONTENT_bias.format(GemmSpec="Default"), + ) + + Path(os.path.join(self.impl_path, f"{k.name}.cuh")).write_text( + INSTANCE_IMPL_str + ) + + INSTANCE_template = """// SPDX-License-Identifier: MIT + + +#include "{name}.cuh" + +template torch::Tensor +{name}<{dtypes}>( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int KBatch); + +""" + INSTANCE_dBF16_eBF16 = INSTANCE_template.format(name=k.name, dtypes="B16") + INSTANCE_dFP32_eBF16 = INSTANCE_template.format(name=k.name, dtypes="F32, B16") + INSTANCE_dFP16_eFP16 = INSTANCE_template.format(name=k.name, 
dtypes="F16") + INSTANCE_dFP32_eFP16 = INSTANCE_template.format(name=k.name, dtypes="F32, F16") + + if self.istune: + Path( + os.path.join(self.instances_path, f"{k.name}_dBF16_eBF16.cpp") + ).write_text(INSTANCE_dBF16_eBF16) + else: + Path( + os.path.join(self.instances_path, f"{k.name}_dBF16_eBF16.cpp") + ).write_text(INSTANCE_dBF16_eBF16) + Path( + os.path.join(self.instances_path, f"{k.name}_dFP32_eBF16.cpp") + ).write_text(INSTANCE_dFP32_eBF16) + Path( + os.path.join(self.instances_path, f"{k.name}_dFP16_eFP16.cpp") + ).write_text(INSTANCE_dFP16_eFP16) + Path( + os.path.join(self.instances_path, f"{k.name}_dFP32_eFP16.cpp") + ).write_text(INSTANCE_dFP32_eFP16) + + def gen_lookup_dict(self, kernels_dict): + LOOKUP_head = """#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#define GENERATE_LOOKUP_TABLE(DTYPE, ETYPE) \\ + { \\""" + + LOOKUP_template = """ + {{{mnk}, \\ + {kernel_name}}}, \\""" + + LOOKUP_end = """ + } + +#endif // USE_ROCM +""" + with open( + os.path.join(self.working_path, "batched_gemm_a8w8_lookup.h"), "w" + ) as f: + f.write(LOOKUP_head) + for mnk, k in kernels_dict.items(): + # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) + if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): + f.write( + LOOKUP_template.format( + mnk="{" + + (", ").join(map(lambda x: str(x), list(mnk))) + + "}", + kernel_name=k.name, + ) + ) + elif self.istune and isinstance(mnk, int): + f.write(LOOKUP_template.format(mnk=mnk, kernel_name=k.name)) + f.write(LOOKUP_end) + + def gen_manifest_head(self, kernels_dict): + MAINFEST_head = """#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#include + +#include +""" + MAINFEST_template = """ +template +torch::Tensor +{kernel_name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int KBatch); +""" + MAINFEST_end = """ + +#endif // USE_ROCM +""" + + with open( + 
os.path.join(self.working_path, "batched_gemm_a8w8_manifest.h"), "w" + ) as f: + f.write(MAINFEST_head) + for mnk, k in kernels_dict.items(): + f.write(MAINFEST_template.format(kernel_name=k.name)) + f.write(MAINFEST_end) + + def gen_instances(self, kernels_dict): + if os.path.exists(self.impl_path): + shutil.rmtree(self.impl_path) + os.mkdir(self.impl_path) + if os.path.exists(self.instances_path): + shutil.rmtree(self.instances_path) + os.mkdir(self.instances_path) + + for mnk, k in kernels_dict.items(): + self.gen_instance(k) + + self.gen_lookup_dict(kernels_dict) + self.gen_manifest_head(kernels_dict) + + +def get_tune_dict(tune_dict_csv): + tune_dict = default_kernels_dict + if os.path.exists(tune_dict_csv): + tune_df = pd.read_csv(tune_dict_csv) + for i in range(len(tune_df)): + B = tune_df.loc[i, "B"] + M = tune_df.loc[i, "M"] + N = tune_df.loc[i, "N"] + K = tune_df.loc[i, "K"] + kid = tune_df.loc[i, "kernelId"] + tune_dict[(B, M, N, K)] = kernels_list[kid] + return tune_dict + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK batched gemm a8w8 kernel", + ) + + # the directory for list_blobs/gen_blobs to write files into + parser.add_argument( + "-w", + "--working_path", + default="./", + required=False, + help="the path where all the blobs are going to be generated", + ) + + parser.add_argument( + "-f", + "--tune_file", + default="aiter/configs/a8w8_tuned_batched_gemm.csv", + required=False, + help="tune_file include the result after run batched_gemm_a8w8_tune.py", + ) + + parser.add_argument( + "--tune", action="store_true", required=False, help="generated tune instances" + ) + + # parser.add_argument( + # "--out_type", + # default="all", + # required=False, + # help="Specifie the type of scale\n \ + # all: [bf16, fp16] \n \ + # bf16, fp16" + # ) + + # parser.add_argument( + # "--scale_type", + # default="all", + # required=False, + # help="Specifie the type of scale\n \ + # all: [fp32, same 
as out] \n \ + # same: [same as out]" + # ) + + args = parser.parse_args() + codegen = batched_gemm_a8w8_fwd_codegen(args.working_path, args.tune) + + if args.tune: + codegen.gen_instances(kernels_list) + else: + codegen.gen_instances(get_tune_dict(args.tune_file)) diff --git a/csrc/ck_batched_gemm_a8w8/include/batched_gemm_a8w8.h b/csrc/ck_batched_gemm_a8w8/include/batched_gemm_a8w8.h new file mode 100644 index 0000000000000000000000000000000000000000..1a45a97a577b8f5a594741cd90c7015fbfc3c171 --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/include/batched_gemm_a8w8.h @@ -0,0 +1,22 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include +#include +torch::Tensor batched_gemm_a8w8( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int splitK); + +torch::Tensor batched_gemm_a8w8_tune( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + int kernelId, + int splitK); diff --git a/csrc/ck_batched_gemm_a8w8/include/batched_gemm_a8w8_common.cuh b/csrc/ck_batched_gemm_a8w8/include/batched_gemm_a8w8_common.cuh new file mode 100644 index 0000000000000000000000000000000000000000..441f30a7529f62a429708cd340a09f6a555aa4ae --- /dev/null +++ b/csrc/ck_batched_gemm_a8w8/include/batched_gemm_a8w8_common.cuh @@ -0,0 +1,314 @@ +#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#undef __HIP_NO_HALF_OPERATORS__ +#undef __HIP_NO_HALF_CONVERSIONS__ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#include 
"ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/utility/blkgemmpipe_scheduler.hpp" + +template +using S = ck::Sequence; + +using I8 = int8_t; +using I32 = int; +using F16 = ck::half_t; +using B16 = ck::bhalf_t; +using FP8 = ck::f8_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using ADataType = I8; +using BDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I32; +using ComputeDataType = I8; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Col; +using D2Layout = Row; +using DsLayout = ck::Tuple; +using DsLayout2 = ck::Tuple; +using ELayout = Row; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; + +struct RowwiseScale +{ + template + __host__ __device__ constexpr void + operator()(E &e, const C &c, const D0 &d0, const D1 &d1) const; + + template <> + __host__ __device__ constexpr void operator()( + F16 &e, const AccDataType &c, const F16 &d0, const F16 &d1) const + { + const F32 x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + B16 &e, const AccDataType &c, const B16 &d0, const B16 &d1) const + { + const F32 x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + F16 &e, const AccDataType &c, const F32 &d0, const F32 &d1) const + { + const F32 x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = 
ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + B16 &e, const AccDataType &c, const F32 &d0, const F32 &d1) const + { + const F32 x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } +}; + +struct MultiplyMultiplyAdd +{ + template + __host__ __device__ constexpr void + operator()(E &e, const C &c, const D0 &d0, const D1 &d1, const D2 &d2) const; + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t &e, const int &c, const float &d0, const float &d1, const ck::half_t &d2) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1) + ck::type_convert(d2); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::bhalf_t &e, const int &c, const float &d0, const float &d1, const ck::bhalf_t &d2) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1) + ck::type_convert(d2); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t &e, const int &c, const ck::half_t &d0, const ck::half_t &d1, const ck::half_t &d2) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1) + ck::type_convert(d2); + + e = ck::type_convert(x0_f); + } + + template <> + __host__ __device__ constexpr void operator()( + ck::bhalf_t &e, const int &c, const ck::bhalf_t &d0, const ck::bhalf_t &d1, const ck::bhalf_t &d2) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1) + ck::type_convert(d2); + + e = ck::type_convert(x0_f); + } +}; + +using CDEElementOp = RowwiseScale; + +template +using DsDataType = ck::Tuple; + +#if 0 +template +using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 +// clang-format off +///######| ALayout| BLayout| DsLayout| ELayout| 
AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +///######| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +///######| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| +///###### RRR +/// < Row, Row, DsLayout, ELayout, ADataType, BDataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>; +///###### RCR + < Row, Col, DsLayout, ELayout, ADataType, BDataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, 
ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>; +// clang-format on +#endif + +template < + typename DDataType, typename EDataType, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int WAVE_TILE_M, + int WAVE_TILE_N, + int WAVE_MAP_M, + int WAVE_MAP_N, + typename ABLOCK_TRANSFER, + typename BBLOCK_TRANSFER, + typename CBLOCK_TRANSFER, + typename CBLOCK_SPV, + int CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + int CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + ck::BlockGemmPipelineScheduler LOOP_SCHED, + ck::BlockGemmPipelineVersion PIPELINE_VERSION, + auto GEMM_SPEC = + ck::tensor_operation::device::GemmSpecialization::MNPadding> +using DeviceGemmHelper = + ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< + ALayout, + BLayout, + DsLayout, + ELayout, + ADataType, + BDataType, + DsDataType, + EDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CDEElementOp, + GEMM_SPEC, + BLOCK_SIZE, // Block Size + MBLOCK, // M per Block + NBLOCK, // N per Block + KBLOCK, // K per Block + KBLOCK / ABLOCK_TRANSFER{}.At(0), // AK1 + 16, // BK1 + WAVE_TILE_M, // M per Xdl + WAVE_TILE_N, // N per Xdl + WAVE_MAP_M, // Mxdl per Wave + WAVE_MAP_N, // Nxdl per Wave + ABLOCK_TRANSFER, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + KBLOCK / ABLOCK_TRANSFER{}.At(0), + KBLOCK / ABLOCK_TRANSFER{}.At(0), + 0, + BBLOCK_TRANSFER, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + 0, + CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + CBLOCK_TRANSFER, + CBLOCK_SPV, + LOOP_SCHED, + PIPELINE_VERSION, + ComputeDataType>; + +template +__forceinline__ torch::Tensor batched_gemm_a8w8_rowwise_impl( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int KBatch) +{ + int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = XQ.size(2); + + int StrideA = K; + int StrideB = K; + int StrideE = N; + + int BatchStrideA = M * K; + int 
BatchStrideB = N * K; + int BatchStrideE = M * N; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(XQ)); + auto device_gemm = DeviceGemmInstance{}; + auto invoker = device_gemm.MakeInvoker(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + constexpr ck::index_t NumDTensor = DeviceGemmInstance::NumDTensor; + + auto argument = device_gemm.MakeArgument( + reinterpret_cast(XQ.data_ptr()), + reinterpret_cast(WQ.data_ptr()), + std::array{ + reinterpret_cast(w_scale.data_ptr()), + reinterpret_cast(x_scale.data_ptr())}, + reinterpret_cast(Y.data_ptr()), + M, + N, + K, + B, + StrideA, + StrideB, + std::array{0, 0}, + StrideE, + BatchStrideA, + BatchStrideB, + std::array{N, M}, + BatchStrideE, + a_element_op, + b_element_op, + cde_element_op); + + TORCH_CHECK(device_gemm.IsSupportedArgument(argument), "This GEMM is not supported!"); + + invoker.Run(argument, StreamConfig{at::cuda::getCurrentCUDAStream().stream()}); + return Y; +} + +#endif // USE_ROCM diff --git a/csrc/ck_batched_gemm_bf16/README.md b/csrc/ck_batched_gemm_bf16/README.md new file mode 100644 index 0000000000000000000000000000000000000000..88550ebdb53d6013325968f2bf69899d77af6786 --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/README.md @@ -0,0 +1,24 @@ +# CK batched_gemm bf16 tune + +1. Install aiter: +`cd $aiter_path` +`python3 setup.py develop` + +2. Add GEMM shapes in `aiter/configs/bf16_untuned_batched_gemm.csv` + |**B**|**M**|**N**|**K**| + |-----|-----|-----|-----| + |16 |128 |1536 |7168 | + + +3. Start tuning: +Run the following cmd to start tuning; please wait a few minutes as it will build batched_gemm_bf16_tune via jit: +`python3 csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py -i aiter/configs/bf16_untuned_batched_gemm.csv -o aiter/configs/bf16_tuned_batched_gemm.csv` +You can find the results of the tuning in `aiter/configs/bf16_tuned_batched_gemm.csv`. + +4. 
Build tuned kernels and test: +To test the performance, modify the test instance in `op_tests/test_batched_gemm_bf16.py` and run it. Please wait a few minutes, as it will build the batched_gemm_bf16 tuned kernels listed in `aiter/configs/bf16_tuned_batched_gemm.csv` via JIT: +`python3 op_tests/test_batched_gemm_bf16.py` +If you have built batched_gemm_bf16 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test command, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_bf16.py`. It will rebuild kernels from `aiter/configs/bf16_tuned_batched_gemm.csv`. + +## More +If you use the flag `PREBUILD_KERNELS=1` when you install aiter, it will build the batched_gemm_bf16 kernels listed in the tuned gemm csv by default. If you want to use the new result of batched_gemm_bf16_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after tuning finishes. This can take a lot of time and is not recommended. diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu new file mode 100644 index 0000000000000000000000000000000000000000..2395480c89b4746ead70a47f94768d7316cd5eb8 --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16.cu @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: MIT + + +#include "batched_gemm_bf16_common.cuh" +#include "batched_gemm_bf16_manifest.h" +#include "batched_gemm_bf16_lookup.h" +#include + +using BatchedKernel = std::function< + torch::Tensor(torch::Tensor &, torch::Tensor &, + torch::Tensor &, std::optional, + int)>; + +// Define a custom hash function for std::tuple +struct IntTupleHash +{ + size_t operator()(const std::tuple &t) const + { + auto hash1 = std::hash{}(std::get<0>(t)); + auto hash2 = std::hash{}(std::get<1>(t)); + auto hash3 = std::hash{}(std::get<2>(t)); + auto hash4 = std::hash{}(std::get<3>(t)); + return hash1 ^ hash2 ^ hash3 ^ hash4; + } +}; + +// For certain high priority shapes, we directly use the best kernel rather +// than use heuristics. 
+using BatchedKernelMap = std::unordered_map< + std::tuple, + BatchedKernel, + IntTupleHash>; + +BatchedKernel batched_heuristic_dispatch(int B, int M, int N, int K) +{ + // Apply shape heuristics to find a suitable kernel implementation. + if (M < 64 && N < 2048 && K < 2048) + { + // Kernel that generally works well on small shapes. + return bf16_batched_64x16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2; + } + else if (M < 64 && K < 2048) + { + // Kernel that works well for small batch size and small K. + return bf16_batched_128x16x32x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2; + } + else if (M < 64 && N < 2048) + { + // Kernel that works well for small batch size and small N. + return bf16_batched_128x32x16x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2; + } + else if (M < 64 && N > 2048 && K > 2048) + { + // Kernel that works well for small M but larger N and K. + return bf16_batched_64x16x16x128_16x16_1x1_16x4x1_16x4x1_1x16x1x4_4x4x1_1x1_intrawave_v1; + } + else if (M < 64) + { + // Fallback to generic small batch kernel if we cant find a good match. + return bf16_batched_64x16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2; + /* } else if (((M < 512 && K < 8192) || (N <= 2048 && K <= 8192) || (K <= 2048 && N <= 8192)) && K >= 1024) { + // Kernel that is optimized for larger batch sizes but otherwise small + // tensors. + return bf16_batched_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5; */ + } + else if (K < 1024) + { + // Special case for small K. + return bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_interwave_v1; + } + else if (M < 1024) + { + // Kernel for generic medium batch sizes. 
+ return bf16_batched_256x128x128x64_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3; + } + else if (M >= 1024 && N >= 1024 && K >= 1024) + { + // Kernel for very large gemm + // return bf16_batched_256x256x256x128_16x16_8x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3; + return bf16_batched_256x256x128x32_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1; + } + else + { + // Fallback large kernel. + return bf16_batched_256x224x256x32_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3; + } +} + +// Helper function to return the next largest power of 2 +static constexpr int nextPow2(unsigned int num) +{ + if (num <= 1) + return 1; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +BatchedKernel batched_dispatch(int B, int M, int N, int K) +{ + // For a given shape, either find the best kernel via lookup or heuristic. + // For many small M shapes, we bucket them to the next largest kernel. + // This is fine since kernels are padded anyway. + + static const auto lookup = [] + { + return BatchedKernelMap{GENERATE_LOOKUP_TABLE()}; + }(); + + // First check if this shape(M,N,K) is available in the direct lookup. + auto it = lookup.find({B, M, N, K}); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + + int padded_m = M; + if (M > 1 && M <= 16) + { + padded_m = 16; + } + else if (M <= 16384) + { + padded_m = nextPow2(M); + } + else if (M <= 20480) + { + padded_m = 20480; + } + // Second check if this shape(padded_m,N,K) is available in the direct lookup. + it = lookup.find({B, padded_m, N, K}); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + // Otherwise, use heuristics. 
+ return batched_heuristic_dispatch(B, M, N, K); +} + +torch::Tensor batched_gemm_bf16( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &Y, + std::optional bias, + int splitK) +{ + TORCH_CHECK(XQ.dtype() == at::ScalarType::BFloat16 && XQ.dtype() == WQ.dtype(), + "Weights and activations should both be bf16!"); + if (bias != std::nullopt) + TORCH_CHECK(bias.value().dtype() == Y.dtype(), + "Out and bias should have the same dtype!"); + + int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = XQ.size(2); + int KBatch = 1; + + batched_dispatch(B, M, N, K)(XQ, WQ, Y, bias, KBatch); + + return Y; +} diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_common.py b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_common.py new file mode 100644 index 0000000000000000000000000000000000000000..9c0c2fecee0f3e5526ec85e5257764d0e1060829 --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_common.py @@ -0,0 +1,1782 @@ +# SPDX-License-Identifier: MIT +from dataclasses import dataclass + + +@dataclass +class kernelInstance: + BLOCK_SIZE: int + MPerBLOCK: int + NPerBLOCK: int + KPerBLOCK: int + WAVE_TILE_M: int + WAVE_TILE_N: int + WAVE_MAP_M: int + WAVE_MAP_N: int + ABLOCK_TRANSFER: list[int] + BBLOCK_TRANSFER: list[int] + CBLOCK_TRANSFER: list[int] + CBLOCK_SPV: list[int] + CSHUFFLE_MX_PER_WAVE_PERSHUFFLE: int + CSHUFFLE_NX_PER_WAVE_PERSHUFFLE: int + LOOP_SCHED: str + PIPELINE_VERSION: int + + @property + def name(self) -> str: + return ("_").join( + [ + "bf16_batched", + ("x").join( + map( + lambda x: str(x), + [ + self.BLOCK_SIZE, + self.MPerBLOCK, + self.NPerBLOCK, + self.KPerBLOCK, + ], + ) + ), + ("x").join(map(lambda x: str(x), [self.WAVE_TILE_M, self.WAVE_TILE_N])), + ("x").join(map(lambda x: str(x), [self.WAVE_MAP_M, self.WAVE_MAP_N])), + ("x").join(map(lambda x: str(x), self.ABLOCK_TRANSFER)), + ("x").join(map(lambda x: str(x), self.BBLOCK_TRANSFER)), + ("x").join(map(lambda x: str(x), self.CBLOCK_TRANSFER)), + ("x").join(map(lambda 
x: str(x), self.CBLOCK_SPV)), + ("x").join( + map( + lambda x: str(x), + [ + self.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + self.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + ], + ) + ), + self.LOOP_SCHED.lower(), + f"v{self.PIPELINE_VERSION}", + ] + ) + + +kernels_list = { + # id: kernel: BLOCK_SIZE| MPerBLOCK| NPerBLOCK| KPerBLOCK| WAVE_TILE_M| WAVE_TILE_N| WAVE_MAP_M| WAVE_MAP_N| ABLOCK_TRANSFER| BBLOCK_TRANSFER| CBLOCK_TRANSFER| CBLOCK_SPV| CSHUFFLE_MX| CSHUFFLE_NX| LOOP_SCHED| PIPELINE_VERSION + 0: kernelInstance( + 256, + 256, + 256, + 32, + 32, + 32, + 4, + 4, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 1: kernelInstance( + 256, + 256, + 256, + 32, + 32, + 32, + 4, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 2: kernelInstance( + 256, + 256, + 224, + 32, + 32, + 32, + 2, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 3: kernelInstance( + 256, + 256, + 192, + 32, + 32, + 32, + 4, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 4: kernelInstance( + 256, + 256, + 160, + 32, + 32, + 32, + 2, + 5, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 5: kernelInstance( + 256, + 256, + 128, + 64, + 32, + 32, + 4, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 6: kernelInstance( + 256, + 256, + 96, + 64, + 32, + 32, + 2, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 7: kernelInstance( + 256, + 256, + 64, + 64, + 32, + 32, + 4, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 8: kernelInstance( + 256, + 128, + 256, + 64, + 32, + 32, + 2, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 9: kernelInstance( + 256, + 128, + 224, + 64, 
+ 32, + 32, + 1, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 10: kernelInstance( + 256, + 128, + 192, + 64, + 32, + 32, + 2, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 11: kernelInstance( + 256, + 128, + 160, + 64, + 32, + 32, + 1, + 5, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 12: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 13: kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 14: kernelInstance( + 256, + 128, + 96, + 128, + 32, + 32, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 15: kernelInstance( + 256, + 128, + 64, + 128, + 32, + 32, + 2, + 1, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 16: kernelInstance( + 256, + 64, + 256, + 64, + 32, + 32, + 1, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 17: kernelInstance( + 256, + 64, + 224, + 64, + 16, + 16, + 2, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 18: kernelInstance( + 256, + 64, + 192, + 128, + 32, + 32, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 19: kernelInstance( + 256, + 64, + 192, + 64, + 32, + 32, + 1, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 20: kernelInstance( + 256, + 64, + 160, + 128, + 16, + 16, + 2, + 5, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 21: kernelInstance( + 256, + 64, + 128, + 128, + 32, + 32, + 1, + 2, + 
[16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 22: kernelInstance( + 256, + 64, + 96, + 128, + 16, + 16, + 2, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 23: kernelInstance( + 256, + 64, + 64, + 256, + 32, + 32, + 1, + 1, + [32, 8, 1], + [32, 8, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 24: kernelInstance( + 256, + 32, + 256, + 64, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 25: kernelInstance( + 256, + 32, + 224, + 128, + 16, + 16, + 1, + 7, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 26: kernelInstance( + 256, + 32, + 192, + 128, + 16, + 16, + 1, + 6, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 27: kernelInstance( + 256, + 32, + 160, + 128, + 16, + 16, + 1, + 5, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 28: kernelInstance( + 256, + 32, + 128, + 128, + 32, + 32, + 1, + 1, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 29: kernelInstance( + 256, + 32, + 96, + 128, + 16, + 16, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 30: kernelInstance( + 256, + 32, + 64, + 256, + 16, + 16, + 1, + 2, + [32, 8, 1], + [32, 8, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 31: kernelInstance( + 256, + 16, + 256, + 64, + 16, + 16, + 1, + 4, + [16, 16, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 32: kernelInstance( + 256, + 16, + 192, + 128, + 16, + 16, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 33: kernelInstance( + 256, + 16, + 128, + 128, + 16, + 16, + 1, + 2, + [16, 16, 1], + 
[16, 16, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 34: kernelInstance( + 256, + 16, + 64, + 256, + 16, + 16, + 1, + 1, + [32, 8, 1], + [32, 8, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 35: kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 36: kernelInstance( + 256, + 128, + 128, + 32, + 32, + 32, + 2, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 37: kernelInstance( + 256, + 256, + 256, + 32, + 16, + 16, + 8, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 38: kernelInstance( + 256, + 256, + 256, + 32, + 16, + 16, + 8, + 8, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 39: kernelInstance( + 256, + 224, + 256, + 32, + 16, + 16, + 7, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 40: kernelInstance( + 256, + 256, + 224, + 32, + 16, + 16, + 8, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 41: kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 5, + ), + 42: kernelInstance( + 256, + 128, + 256, + 32, + 32, + 32, + 2, + 4, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 43: kernelInstance( + 256, + 256, + 128, + 32, + 32, + 32, + 4, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 44: kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 45: kernelInstance( + 256, + 128, + 64, + 64, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + 
[8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 46: kernelInstance( + 256, + 64, + 128, + 64, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 47: kernelInstance( + 256, + 64, + 64, + 64, + 32, + 32, + 1, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + # mem(Intrawave): Latency friendly + 48: kernelInstance( + 128, + 32, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 1, + ), + 49: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + 50: kernelInstance( + 128, + 16, + 32, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + # mem(Intrawave): Memory friendly, Col + 51: kernelInstance( + 256, + 256, + 32, + 64, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 52: kernelInstance( + 256, + 256, + 16, + 32, + 16, + 16, + 4, + 1, + [8, 32, 1], + [8, 16, 1], + [1, 32, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 53: kernelInstance( + 128, + 128, + 32, + 64, + 32, + 32, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 54: kernelInstance( + 128, + 128, + 16, + 64, + 16, + 16, + 4, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 55: kernelInstance( + 128, + 64, + 32, + 64, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 56: kernelInstance( + 128, + 64, + 16, + 64, + 16, + 16, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 57: kernelInstance( + 128, + 32, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 
16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 58: kernelInstance( + 64, + 16, + 16, + 32, + 16, + 16, + 1, + 1, + [4, 16, 1], + [4, 16, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 59: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 60: kernelInstance( + 128, + 16, + 32, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 61: kernelInstance( + 128, + 16, + 64, + 64, + 16, + 16, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 62: kernelInstance( + 128, + 32, + 64, + 64, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 2, + ), + 63: kernelInstance( + 128, + 16, + 128, + 64, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 64: kernelInstance( + 128, + 32, + 128, + 64, + 32, + 32, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 65: kernelInstance( + 256, + 16, + 256, + 64, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 66: kernelInstance( + 256, + 32, + 256, + 64, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 1, + "Intrawave", + 2, + ), + # mem(Interwave): Latency friendly + 67: kernelInstance( + 128, + 32, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 1, + ), + 68: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 1, + ), + 69: kernelInstance( + 128, + 16, + 32, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 
1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 1, + ), + # mem(Interwave): Memory friendly, Col + 70: kernelInstance( + 256, + 256, + 32, + 64, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 71: kernelInstance( + 256, + 256, + 16, + 32, + 16, + 16, + 4, + 1, + [8, 32, 1], + [8, 16, 1], + [1, 32, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 72: kernelInstance( + 128, + 128, + 32, + 64, + 32, + 32, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 73: kernelInstance( + 128, + 128, + 16, + 64, + 16, + 16, + 4, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 74: kernelInstance( + 128, + 64, + 32, + 64, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 75: kernelInstance( + 128, + 64, + 16, + 64, + 16, + 16, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 76: kernelInstance( + 128, + 32, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 77: kernelInstance( + 64, + 16, + 16, + 32, + 16, + 16, + 1, + 1, + [4, 16, 1], + [4, 16, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 78: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 79: kernelInstance( + 128, + 16, + 32, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 80: kernelInstance( + 128, + 16, + 64, + 64, + 16, + 16, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 81: kernelInstance( + 128, + 32, + 64, + 64, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 
8, 1], + 1, + 1, + "Interwave", + 2, + ), + 82: kernelInstance( + 128, + 16, + 128, + 64, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 83: kernelInstance( + 128, + 32, + 128, + 64, + 32, + 32, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 84: kernelInstance( + 256, + 16, + 256, + 64, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 85: kernelInstance( + 256, + 32, + 256, + 64, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), +} + + +default_kernels_dict = { + # ( M, N, K): kernel: BLOCK_SIZE| MPerBLOCK| NPerBLOCK| KPerBLOCK| WAVE_TILE_M| WAVE_TILE_N| WAVE_MAP_M| WAVE_MAP_N| ABLOCK_TRANSFER| BBLOCK_TRANSFER| CBLOCK_TRANSFER| CBLOCK_SPV| CSHUFFLE_MX| CSHUFFLE_NX| LOOP_SCHED|PIPELINE_VERSION + (-1): kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + (-3): kernelInstance( + 128, + 32, + 16, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + (-4): kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [16, 4, 1], + [16, 4, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + (-5): kernelInstance( + 128, + 16, + 32, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + (-6): kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + (-7): kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + (-8): kernelInstance( + 256, + 256, + 128, + 
32, + 32, + 32, + 4, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + (-9): kernelInstance( + 256, + 224, + 256, + 32, + 16, + 16, + 7, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + (-10): kernelInstance( + 128, + 16, + 32, + 64, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), +} diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.cu b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.cu new file mode 100644 index 0000000000000000000000000000000000000000..f8af02b843a49b26ffbcde331458dcb1de93fa52 --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.cu @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT + + +#include "batched_gemm_bf16_common.cuh" +#include "batched_gemm_bf16_manifest.h" +#include "batched_gemm_bf16_lookup.h" +#include + +using BatchedKernel = std::function< + torch::Tensor(torch::Tensor &, torch::Tensor &, + torch::Tensor &, std::optional, + int)>; + +// For certain high priority shapes, we directly use the best kernel rather +// than use heuristics. +using BatchedKernelMap = std::unordered_map< + int, + BatchedKernel>; + +// Helper function to return the next largest power of 2 +static constexpr int nextPow2(unsigned int num) +{ + if (num <= 1) + return 1; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +BatchedKernel batched_dispatch(int id) +{ + // For a given shape, either find the best kernel via lookup or heuristic. + // For many small M shapes, we bucket them to the next largest kernel. + // This is fine since kernels are padded anyway. + + // First check if this shape is available in the direct lookup. 
+ static const auto lookup = [] + { + return BatchedKernelMap{GENERATE_LOOKUP_TABLE()}; + }(); + + TORCH_CHECK(id < lookup.size(), + "Kernel id " + std::to_string(id) +" is out of range!"); + auto it = lookup.find(id); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + // Otherwise, use heuristics. + return lookup.find(0)->second; +} + + +torch::Tensor batched_gemm_bf16_tune( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &Y, + int kernelId, + int splitK) +{ + TORCH_CHECK(XQ.dtype() == at::ScalarType::BFloat16 && XQ.dtype() == WQ.dtype(), + "Weights and activations should both be bf16!"); + std::optional bias = std::nullopt; + + int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = XQ.size(2); + int KBatch = std::pow(2, splitK); + + if (Y.dtype() == at::ScalarType::BFloat16) + { + batched_dispatch(kernelId)(XQ, WQ, Y, bias, KBatch); + } + else + { + TORCH_CHECK(false, "Unsupported output dtype!"); + } + return Y; +} diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py new file mode 100644 index 0000000000000000000000000000000000000000..8e088a4837b5c6ab72a04239f5389287c0e271e0 --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py @@ -0,0 +1,198 @@ +# SPDX-License-Identifier: MIT +import os +import aiter +import pandas as pd +import torch +import torch.nn.functional as F +from aiter.test_common import perftest +from aiter import dtypes +from batched_gemm_bf16_common import kernels_list +import argparse + + +def checkClose(a, b, rtol=1e-3, atol=0.01): + isClose = torch.isclose(a, b, rtol=rtol, atol=atol) + mask = ~isClose + if isClose.all(): + return True + else: + percent = (a[mask]).numel() / a.numel() + if percent > 0.01: + return False + else: + return True + + +def run_torch(x, weight, bias=None, dtype=dtypes.bf16): + B = x.size(0) + M = x.size(1) + N = weight.size(1) + out = torch.empty(B, M, N, 
dtype=dtypes.bf16, device="cuda") + for b in range(B): + b_out = F.linear(x[b, :, :].to(dtypes.fp32), weight[b, :, :].to(dtypes.fp32)) + if bias is not None: + b_out = b_out.to(bias[b, :, :]) + bias[b, :, :] + out[b, :, :] = b_out + return out.to(dtype) + + +def get_untuned_batched_gemm_list(untuned_batched_gemm_file): + assert os.path.exists( + untuned_batched_gemm_file + ), f"Not exist bf16_untuned_batched_gemm.csv file: {untuned_batched_gemm_file}" + untunedf = pd.read_csv(untuned_batched_gemm_file) + return untunedf + + +def get_tuned_batched_gemm_list(tuned_batched_gemm_file): + if os.path.exists(tuned_batched_gemm_file): + tunedf = pd.read_csv(tuned_batched_gemm_file) + else: + tunedf = pd.DataFrame( + columns=["B", "M", "N", "K", "kernelId", "splitK", "us", "kernelName"] + ) + return tunedf + + +@perftest() +def kernel_instance_test(x, weight, out, kernel_id, splitK=0): + aiter.batched_gemm_bf16_tune(x, weight, out, kernel_id, splitK) + return out + + +def tune_batched_gemm(b, m, n, k, useSplitK=False): + dim = (b, m, n, k) + x = torch.randint(-20, 20, (b, m, k), dtype=dtypes.bf16, device="cuda") + weight = torch.randint(-20, 20, (b, n, k), dtype=dtypes.bf16, device="cuda") + out = torch.empty(b, m, n, dtype=dtypes.bf16, device="cuda") + + ref_out = run_torch(x, weight) + + print(f"*******************B:{b} X M:{m} X N:{n} X K:{k}**************************") + print(f"Start tuning bf16 batched_gemm kernel for B:{b}, M:{m}, N:{n}, K{k}:") + kernels_num = len(kernels_list) + best_kernelConfig = (-1, 0) + best_time = -1 + for i in range(kernels_num): + kernel = kernels_list[i] + maxsplitK = ( + aiter.compute_batched_gemm_SplitK( + b, m, n, k, kernel.MPerBLOCK, kernel.NPerBLOCK, kernel.KPerBLOCK + ) + if useSplitK + else 0 + ) + for splitK in range(maxsplitK + 1): + try: + (out), avg_t = kernel_instance_test(x, weight, out, i, splitK) + isClosed = checkClose(ref_out, out, rtol=1e-2, atol=0.01) + if isClosed: + print( + f"{str(dim):<20} kernelid:{i:<3d}\t avg: 
{avg_t:<8.2f} us, {kernel.name}, {splitK=}" + ) + if best_time < 0 or avg_t < best_time: + best_kernelConfig = (i, splitK) + best_time = avg_t + else: + print( + f"{str(dim):<20} kernelid:{i:<3d}\t No pass , {kernel.name}, {splitK=}" + ) + except RuntimeError as e: + print(f"error = {e}") + print( + f"{str(dim):<20} kernelid:{i:<3d}\t No support , {kernel.name}, {splitK=}" + ) + + best_kernelId, splitK = best_kernelConfig + if best_kernelConfig[0] == -1: + print(f"No kernel can be used for B{b}, M:{m}, N:{n}, K:{k}") + best_time = "nan" + else: + best_time = round(best_time, 4) + + print( + f"Tuning result for B:{b}, M:{m}, N:{n}, K:{k} is kernelId={best_kernelId} {kernel.name} {splitK=}, {best_time}us" + ) + print(f"*******************B:{b} X M:{m} X N:{n} X K{k}**************************") + + return best_kernelId, splitK, best_time + + +def tune_batched_gemm_list(untunedf, tunedf, issorted=False, useSplitK=False): + for i in range(len(untunedf)): + B = untunedf.loc[i, "B"] + M = untunedf.loc[i, "M"] + N = untunedf.loc[i, "N"] + K = untunedf.loc[i, "K"] + + if tunedf[ + (tunedf["B"] == B) + & (tunedf["M"] == M) + & (tunedf["N"] == N) + & (tunedf["K"] == K) + ].empty: + kernelId, splitK, time = tune_batched_gemm(B, M, N, K, useSplitK) + kernelName = "None" if kernelId == -1 else kernels_list[kernelId].name + temp = pd.DataFrame( + { + "B": [B], + "M": [M], + "N": [N], + "K": [K], + "kernelId": [kernelId], + "splitK": [splitK], + "us": [time], + "kernelName": [kernelName], + } + ) + tunedf = pd.concat([tunedf, temp], ignore_index=True) + + else: + print(f"B:{B}, M:{M}, N:{N}, K{K} is in tuned batched_gemm, skip!!!") + print() + print() + if issorted: + tunedf = tunedf.sort_values(by=["B", "M", "N", "K"]) + print("Totall tuning result:") + print(tunedf) + return tunedf + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK batched_gemm bf16 kernel", + ) + + parser.add_argument( + "-i", + "--untune_file", 
+ default="aiter/configs/bf16_untuned_batched_gemm.csv", + required=False, + help="input", + ) + + parser.add_argument( + "-o", + "--tune_file", + default="aiter/configs/bf16_tuned_batched_gemm.csv", + required=False, + help="output: tuning result store this file", + ) + + parser.add_argument( + "-k", "--splitK", action="store_true", required=False, help="Use splitK kernels" + ) + + parser.add_argument( + "--sort", + action="store_true", + required=False, + help="Arranged according to the B M N K size", + ) + + args = parser.parse_args() + untunedf = get_untuned_batched_gemm_list(args.untune_file) + tunedf = get_tuned_batched_gemm_list(args.tune_file) + tunedf = tune_batched_gemm_list(untunedf, tunedf, args.sort, args.splitK) + tunedf.to_csv(args.tune_file, index=False) diff --git a/csrc/ck_batched_gemm_bf16/gen_instances.py b/csrc/ck_batched_gemm_bf16/gen_instances.py new file mode 100644 index 0000000000000000000000000000000000000000..c5641eee0e8db974893242f47a096d361f0ca6d2 --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/gen_instances.py @@ -0,0 +1,283 @@ +# SPDX-License-Identifier: MIT +import os +from pathlib import Path +import pandas as pd +import argparse +import shutil +from batched_gemm_bf16_common import kernelInstance, kernels_list, default_kernels_dict + + +class batched_gemm_bf16_fwd_codegen: + def __init__(self, working_path, istune=False): + self.working_path = working_path + self.impl_path = os.path.join(working_path, "impl") + self.instances_path = os.path.join(working_path, "instances") + self.istune = istune + + def gen_instance(self, k: kernelInstance): + INSTANCE_IMPL = f"""// SPDX-License-Identifier: MIT + + +#include "batched_gemm_bf16_common.cuh" + +torch::Tensor +{k.name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &Y, + std::optional bias, + int KBatch) +{{{{ + // The smallest kernel we have available. Works well for memory bound shapes. + +#if 0 + // Check if this input needs to be padded. 
+ int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = WQ.size(2); + bool pad = (M % {k.MPerBLOCK} != 0) || (N % {k.NPerBLOCK} != 0) || (K % ({k.KPerBLOCK} * KBatch) != 0); +#else + // Disable padding for packed tensor + bool pad = false; +#endif + if (pad) + {{{{ + // pad + {{INSTANCE_CONTENT_pad}} + // pad + }}}} + else + {{{{ + // no pad + {{INSTANCE_CONTENT_nopad}} + // no pad + }}}} +}}}} + +""" + INSTANCE_CONTENT_bias = f""" + {{{{ + using DeviceGemmInstance = DeviceGemmHelper< + {k.BLOCK_SIZE}, + {k.MPerBLOCK}, + {k.NPerBLOCK}, + {k.KPerBLOCK}, + {k.WAVE_TILE_M}, + {k.WAVE_TILE_N}, + {k.WAVE_MAP_M}, + {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + ck::BlockGemmPipelineScheduler::{k.LOOP_SCHED}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. + return batched_gemm_bf16_impl(XQ, WQ, Y, bias, KBatch); + }}}} +""" + INSTANCE_CONTENT_nobias = f"""using DeviceGemmInstance = DeviceGemmHelper< + {k.BLOCK_SIZE}, + {k.MPerBLOCK}, + {k.NPerBLOCK}, + {k.KPerBLOCK}, + {k.WAVE_TILE_M}, + {k.WAVE_TILE_N}, + {k.WAVE_MAP_M}, + {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + ck::BlockGemmPipelineScheduler::{k.LOOP_SCHED}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. 
+ return batched_gemm_bf16_impl(XQ, WQ, Y, bias, KBatch); +""" + if self.istune: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="MNKPadding") + ), + INSTANCE_CONTENT_nopad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="Default") + ), + ) + else: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=INSTANCE_CONTENT_bias.format( + GemmSpec="MNKPadding" + ), + INSTANCE_CONTENT_nopad=INSTANCE_CONTENT_bias.format(GemmSpec="Default"), + ) + + Path(os.path.join(self.impl_path, f"{k.name}.cuh")).write_text( + INSTANCE_IMPL_str + ) + + INSTANCE_template = """// SPDX-License-Identifier: MIT + + +#include "{name}.cuh" + +torch::Tensor +{name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &Y, + std::optional bias, + int KBatch); + +""" + INSTANCE = INSTANCE_template.format(name=k.name) + + if self.istune: + Path(os.path.join(self.instances_path, f"{k.name}.cpp")).write_text( + INSTANCE + ) + else: + Path(os.path.join(self.instances_path, f"{k.name}.cpp")).write_text( + INSTANCE + ) + + def gen_lookup_dict(self, kernels_dict): + LOOKUP_head = """#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#define GENERATE_LOOKUP_TABLE() \\ + { \\""" + + LOOKUP_template = """ + {{{mnk}, \\ + {kernel_name}}}, \\""" + + LOOKUP_end = """ + } + +#endif // USE_ROCM +""" + with open( + os.path.join(self.working_path, "batched_gemm_bf16_lookup.h"), "w" + ) as f: + f.write(LOOKUP_head) + for mnk, k in kernels_dict.items(): + # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) + if not self.istune and (isinstance(mnk, tuple) and mnk[0] > 0): + f.write( + LOOKUP_template.format( + mnk="{" + + (", ").join(map(lambda x: str(x), list(mnk))) + + "}", + kernel_name=k.name, + ) + ) + elif self.istune and isinstance(mnk, int): + f.write(LOOKUP_template.format(mnk=mnk, kernel_name=k.name)) + f.write(LOOKUP_end) + + def gen_manifest_head(self, kernels_dict): + MAINFEST_head = 
def get_tune_dict(tune_dict_csv):
    """Build the shape -> kernel table used for code generation.

    The table starts from the entries of ``default_kernels_dict`` and gets one
    additional ``(B, M, N, K)`` entry per usable row of the tuning CSV.

    Args:
        tune_dict_csv: Path of the CSV produced by the tuning script. It needs
            the columns ``B``, ``M``, ``N``, ``K`` and ``kernelId``. A missing
            file is not an error; only the defaults are returned then.

    Returns:
        A new dict mapping keys to kernel instances. ``default_kernels_dict``
        itself is never modified.
    """
    # Work on a copy: assigning the module-level dict directly made every call
    # write the tuned shapes into ``default_kernels_dict`` itself.
    tune_dict = dict(default_kernels_dict)
    if not os.path.exists(tune_dict_csv):
        return tune_dict

    tune_df = pd.read_csv(tune_dict_csv)
    rows = tune_df[["B", "M", "N", "K", "kernelId"]].itertuples(
        index=False, name=None
    )
    for B, M, N, K, kid in rows:
        # The tuner stores kernelId -1 for shapes where no kernel could be
        # used. Such ids are not keys of ``kernels_list``; skip the row so the
        # shape falls back to the defaults instead of raising KeyError.
        if kid not in kernels_list:
            continue
        tune_dict[(B, M, N, K)] = kernels_list[kid]
    return tune_dict
) + + args = parser.parse_args() + codegen = batched_gemm_bf16_fwd_codegen(args.working_path, args.tune) + + if args.tune: + codegen.gen_instances(kernels_list) + else: + codegen.gen_instances(get_tune_dict(args.tune_file)) diff --git a/csrc/ck_batched_gemm_bf16/include/batched_gemm_bf16.h b/csrc/ck_batched_gemm_bf16/include/batched_gemm_bf16.h new file mode 100644 index 0000000000000000000000000000000000000000..a94ab76a3ef1dce86d813d6ba58e6030a11ad045 --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/include/batched_gemm_bf16.h @@ -0,0 +1,18 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include +#include +torch::Tensor batched_gemm_bf16( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &Y, + std::optional bias, + int splitK); + +torch::Tensor batched_gemm_bf16_tune( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &Y, + int kernelId, + int splitK); diff --git a/csrc/ck_batched_gemm_bf16/include/batched_gemm_bf16_common.cuh b/csrc/ck_batched_gemm_bf16/include/batched_gemm_bf16_common.cuh new file mode 100644 index 0000000000000000000000000000000000000000..275093f120034e3aad2bf8e36164014daaaaef7d --- /dev/null +++ b/csrc/ck_batched_gemm_bf16/include/batched_gemm_bf16_common.cuh @@ -0,0 +1,191 @@ +#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#undef __HIP_NO_HALF_OPERATORS__ +#undef __HIP_NO_HALF_CONVERSIONS__ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include 
"ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/utility/blkgemmpipe_scheduler.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using ADataType = BF16; +using BDataType = BF16; +using AccDataType = F32; +using CShuffleDataType = F32; +using ComputeDataType = BF16; +using EDataType = BF16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +using DsDataType = ck::Tuple<>; + +template < + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int WAVE_TILE_M, + int WAVE_TILE_N, + int WAVE_MAP_M, + int WAVE_MAP_N, + typename ABLOCK_TRANSFER, + typename BBLOCK_TRANSFER, + typename CBLOCK_TRANSFER, + typename CBLOCK_SPV, + int CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + int CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + ck::BlockGemmPipelineScheduler LOOP_SCHED, + ck::BlockGemmPipelineVersion PIPELINE_VERSION, + auto GEMM_SPEC = + ck::tensor_operation::device::GemmSpecialization::MNPadding> +using DeviceGemmHelper = + ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< + ALayout, + BLayout, + DsLayout, + ELayout, + ADataType, + BDataType, + DsDataType, + EDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CDEElementOp, + GEMM_SPEC, + BLOCK_SIZE, // Block Size + MBLOCK, // M per Block + NBLOCK, // N per Block + KBLOCK, // K per Block + KBLOCK / ABLOCK_TRANSFER{}.At(0),// AK1 + KBLOCK / BBLOCK_TRANSFER{}.At(0),// AK1 + WAVE_TILE_M, // M per Xdl + WAVE_TILE_N, // N per Xdl + WAVE_MAP_M, // Mxdl per Wave + WAVE_MAP_N, // Nxdl per Wave + 
ABLOCK_TRANSFER, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + KBLOCK / ABLOCK_TRANSFER{}.At(0), + KBLOCK / ABLOCK_TRANSFER{}.At(0), + 0, + BBLOCK_TRANSFER, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + KBLOCK / BBLOCK_TRANSFER{}.At(0), + KBLOCK / BBLOCK_TRANSFER{}.At(0), + 0, + CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + CBLOCK_TRANSFER, + CBLOCK_SPV, + LOOP_SCHED, + PIPELINE_VERSION, + ComputeDataType>; + +template +__forceinline__ torch::Tensor batched_gemm_bf16_impl( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &Y, + std::optional bias, + int KBatch) +{ + int B = XQ.size(0); + int M = XQ.size(1); + int N = WQ.size(1); + int K = XQ.size(2); + + int StrideA = K; + int StrideB = K; + int StrideE = N; + + int BatchStrideA = M * K; + int BatchStrideB = N * K; + int BatchStrideE = M * N; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(XQ)); + auto device_gemm = DeviceGemmInstance{}; + auto invoker = device_gemm.MakeInvoker(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + constexpr ck::index_t NumDTensor = DeviceGemmInstance::NumDTensor; + + auto argument = device_gemm.MakeArgument( + reinterpret_cast(XQ.data_ptr()), + reinterpret_cast(WQ.data_ptr()), + std::array{}, + reinterpret_cast(Y.data_ptr()), + M, + N, + K, + B, + StrideA, + StrideB, + std::array{}, + StrideE, + BatchStrideA, + BatchStrideB, + std::array{}, + BatchStrideE, + a_element_op, + b_element_op, + cde_element_op); + + TORCH_CHECK(device_gemm.IsSupportedArgument(argument), "This GEMM is not supported!"); + + invoker.Run(argument, StreamConfig{at::cuda::getCurrentCUDAStream().stream()}); + return Y; +} + +#endif // USE_ROCM diff --git a/csrc/ck_gemm_a8w8/README.md b/csrc/ck_gemm_a8w8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..861a1471f83992ed1c91d81b87944ac540d15154 --- /dev/null +++ b/csrc/ck_gemm_a8w8/README.md @@ -0,0 +1,30 @@ +# CK gemm a8w8 tune + +1. 
Install aiter: +`cd $aiter_path` +`python3 setup.py develop` + +2. Add GEMM shapes in `aiter/configs/a8w8_untuned_gemm.csv` + |**M**|**N**|**K**| + |-----|-----|-----| + |128 |1536 |7168 | + + +3. Start tuning: +Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_tune via jit: +`python3 csrc/ck_gemm_a8w8/gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_gemm.csv -o aiter/configs/a8w8_tuned_gemm.csv` +If you want to use split K kernels, you can add the `-k` parameter at the end; note that you should then change `bias` to `bias/(2^k)`. +You can find the results of this tuning in `aiter/configs/a8w8_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| + |----------|-----|-----|-----|------------|----------|------|--------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | + + `cu_num` means the number of compute units, and it is used to distinguish between different GPUs. + +4. Build tuned kernels and test: +Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8 tuned kernels in `aiter/configs/a8w8_tuned_gemm.csv` via jit: +`python3 op_tests/test_gemm_a8w8.py` +If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `aiter/configs/a8w8_tuned_gemm.csv`. + +## More +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after tuning has finished. This can take a lot of time and is not recommended. 
// Hash functor for the (M, N, K) shape keys of the kernel lookup table.
//
// The elements are folded in order with the widely used hash-combine recipe
// (golden-ratio constant plus shifted seed). A plain XOR of the three element
// hashes is order-insensitive and cancels equal elements, so e.g.
// (128, 128, K) and (256, 256, K) always landed in the same bucket.
struct IntTupleHash
{
    size_t operator()(const std::tuple<int, int, int> &t) const
    {
        size_t seed = 0;
        combine(seed, std::get<0>(t));
        combine(seed, std::get<1>(t));
        combine(seed, std::get<2>(t));
        return seed;
    }

private:
    // Mixes the hash of `value` into `seed`; the result depends on call order.
    static void combine(size_t &seed, int value)
    {
        seed ^= std::hash<int>{}(value) +
                static_cast<size_t>(0x9e3779b97f4a7c15ULL) +
                (seed << 6) + (seed >> 2);
    }
};
// Rounds `num` up to a power of two; 0 and 1 both map to 1.
static constexpr int nextPow2(unsigned int num)
{
    // For num >= 2 the number of significant bits of (num - 1) is the
    // exponent of the smallest power of two that is not below num.
    return num <= 1
               ? 1
               : 1 << (CHAR_BIT * sizeof(unsigned int) - __builtin_clz(num - 1));
}
+ + static const auto lookup = [] + { + return RowwiseKernelMap{GENERATE_LOOKUP_TABLE(ABDataType, DDataType, EDataType)}; + }(); + + // First check if this shape(M,N,K) is available in the direct lookup. + auto it = lookup.find({M, N, K}); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + + int padded_m = M; + if (M > 1 && M <= 16) + { + padded_m = 16; + } + else if (M <= 16384) + { + padded_m = nextPow2(M); + } + else if (M <= 20480) + { + padded_m = 20480; + } + // Second check if this shape(padded_m,N,K) is available in the direct lookup. + it = lookup.find({padded_m, N, K}); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + // Otherwise, use heuristics. + return rowwise_heuristic_dispatch(M, N, K); +} + +torch::Tensor gemm_a8w8( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int splitK) +{ + TORCH_CHECK((XQ.dtype() == at::ScalarType::Char || XQ.dtype() == torch_fp8) && + XQ.dtype() == WQ.dtype(), + "Weights and activations should both be int8/fp8!"); + TORCH_CHECK(x_scale.dtype() == w_scale.dtype(), + "Scales should have the same dtype!"); + if (bias != std::nullopt) + TORCH_CHECK(bias.value().dtype() == Y.dtype(), + "Out and bias should have the same dtype!"); + + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + int KBatch = std::pow(2, splitK); + + if (XQ.dtype() == at::ScalarType::Char) + { + if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::Half) + { + rowwise_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::BFloat16) + { + rowwise_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else if (Y.dtype() == at::ScalarType::Half) + { + rowwise_dispatch(M, N, K)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + 
@dataclass
class kernelInstance:
    """One a8w8 row-wise GEMM kernel configuration.

    The fields are plain tuning parameters; ``name`` turns them into the
    unique identifier of the kernel.
    """

    BLOCK_SIZE: int
    MPerBLOCK: int
    NPerBLOCK: int
    KPerBLOCK: int
    WAVE_TILE_M: int
    WAVE_TILE_N: int
    WAVE_MAP_M: int
    WAVE_MAP_N: int
    ABLOCK_TRANSFER: list[int]
    BBLOCK_TRANSFER: list[int]
    CBLOCK_TRANSFER: list[int]
    CBLOCK_SPV: list[int]
    CSHUFFLE_MX_PER_WAVE_PERSHUFFLE: int
    CSHUFFLE_NX_PER_WAVE_PERSHUFFLE: int
    LOOP_SCHED: str
    PIPELINE_VERSION: int

    @property
    def name(self) -> str:
        """Return the kernel identifier derived from every field.

        Field groups are rendered as ``x``-separated numbers and the groups
        are joined with ``_``, e.g.
        ``a8w8_rowwise_64x16x16x128_16x16_1x1_..._interwave_v2``.
        """

        def dims(values):
            # "4x64x1"-style rendering of one group of numbers.
            return "x".join(str(value) for value in values)

        segments = [
            "a8w8_rowwise",
            dims([self.BLOCK_SIZE, self.MPerBLOCK, self.NPerBLOCK, self.KPerBLOCK]),
            dims([self.WAVE_TILE_M, self.WAVE_TILE_N]),
            dims([self.WAVE_MAP_M, self.WAVE_MAP_N]),
            dims(self.ABLOCK_TRANSFER),
            dims(self.BBLOCK_TRANSFER),
            dims(self.CBLOCK_TRANSFER),
            dims(self.CBLOCK_SPV),
            dims(
                [
                    self.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE,
                    self.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE,
                ]
            ),
            self.LOOP_SCHED.lower(),
            f"v{self.PIPELINE_VERSION}",
        ]
        return "_".join(segments)
+ 1, + 1, + "Intrawave", + 3, + ), + 8: kernelInstance( + 256, + 128, + 256, + 128, + 32, + 32, + 2, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 9: kernelInstance( + 256, + 128, + 224, + 128, + 32, + 32, + 1, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 10: kernelInstance( + 256, + 128, + 192, + 128, + 32, + 32, + 2, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 11: kernelInstance( + 256, + 128, + 160, + 128, + 32, + 32, + 1, + 5, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 12: kernelInstance( + 256, + 128, + 128, + 256, + 32, + 32, + 2, + 2, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 13: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 14: kernelInstance( + 256, + 128, + 96, + 256, + 32, + 32, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 15: kernelInstance( + 256, + 128, + 64, + 256, + 32, + 32, + 2, + 1, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 16: kernelInstance( + 256, + 64, + 256, + 128, + 32, + 32, + 1, + 4, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 17: kernelInstance( + 256, + 64, + 224, + 128, + 16, + 16, + 2, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 18: kernelInstance( + 256, + 64, + 192, + 256, + 32, + 32, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 19: kernelInstance( + 256, + 64, + 192, + 128, + 32, + 32, + 1, + 3, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + 
"Intrawave", + 3, + ), + 20: kernelInstance( + 256, + 64, + 160, + 256, + 16, + 16, + 2, + 5, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 21: kernelInstance( + 256, + 64, + 128, + 256, + 32, + 32, + 1, + 2, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 22: kernelInstance( + 256, + 64, + 96, + 256, + 16, + 16, + 2, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 23: kernelInstance( + 256, + 64, + 64, + 512, + 32, + 32, + 1, + 1, + [32, 8, 1], + [32, 8, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 24: kernelInstance( + 256, + 32, + 256, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 25: kernelInstance( + 256, + 32, + 224, + 256, + 16, + 16, + 1, + 7, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 26: kernelInstance( + 256, + 32, + 192, + 256, + 16, + 16, + 1, + 6, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 27: kernelInstance( + 256, + 32, + 160, + 256, + 16, + 16, + 1, + 5, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 28: kernelInstance( + 256, + 32, + 128, + 256, + 32, + 32, + 1, + 1, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 29: kernelInstance( + 256, + 32, + 96, + 256, + 16, + 16, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 30: kernelInstance( + 256, + 32, + 64, + 512, + 16, + 16, + 1, + 2, + [32, 8, 1], + [32, 8, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 31: kernelInstance( + 256, + 16, + 256, + 128, + 16, + 16, + 1, + 4, + [16, 16, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, 
+ ), + 32: kernelInstance( + 256, + 16, + 192, + 256, + 16, + 16, + 1, + 3, + [16, 16, 1], + [16, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 33: kernelInstance( + 256, + 16, + 128, + 256, + 16, + 16, + 1, + 2, + [16, 16, 1], + [16, 16, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 34: kernelInstance( + 256, + 16, + 64, + 512, + 16, + 16, + 1, + 1, + [32, 8, 1], + [32, 8, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Intrawave", + 3, + ), + 35: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 36: kernelInstance( + 256, + 128, + 128, + 64, + 32, + 32, + 2, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 4, + ), + 37: kernelInstance( + 256, + 256, + 256, + 128, + 16, + 16, + 8, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 38: kernelInstance( + 256, + 256, + 256, + 64, + 16, + 16, + 8, + 8, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 39: kernelInstance( + 256, + 224, + 256, + 128, + 16, + 16, + 7, + 8, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 2, + "Intrawave", + 3, + ), + 40: kernelInstance( + 256, + 256, + 224, + 128, + 16, + 16, + 8, + 7, + [8, 32, 1], + [8, 32, 1], + [1, 64, 1, 4], + [8, 8, 1], + 2, + 1, + "Intrawave", + 3, + ), + 41: kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 5, + ), + 42: kernelInstance( + 256, + 128, + 256, + 64, + 32, + 32, + 2, + 4, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 43: kernelInstance( + 256, + 256, + 128, + 64, + 32, + 32, + 4, + 2, + [4, 64, 1], + [4, 64, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 44: 
kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 1, + ), + 45: kernelInstance( + 256, + 128, + 64, + 128, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 46: kernelInstance( + 256, + 64, + 128, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + 47: kernelInstance( + 256, + 64, + 64, + 128, + 32, + 32, + 1, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 3, + ), + # mem(Intrawave): Latency friendly + 48: kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 1, + ), + 49: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + 50: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + # mem(Intrawave): Memory friendly, Col + 51: kernelInstance( + 256, + 256, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 52: kernelInstance( + 256, + 256, + 16, + 128, + 16, + 16, + 4, + 1, + [8, 32, 1], + [8, 16, 1], + [1, 32, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 53: kernelInstance( + 128, + 128, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 54: kernelInstance( + 128, + 128, + 16, + 128, + 16, + 16, + 4, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 55: kernelInstance( + 128, + 64, + 32, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 
1, + 1, + "Intrawave", + 2, + ), + 56: kernelInstance( + 128, + 64, + 16, + 128, + 16, + 16, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 57: kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + 58: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [4, 16, 1], + [4, 16, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 59: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 60: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 61: kernelInstance( + 128, + 16, + 64, + 128, + 16, + 16, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Intrawave", + 2, + ), + 62: kernelInstance( + 128, + 32, + 64, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Intrawave", + 2, + ), + 63: kernelInstance( + 128, + 16, + 128, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 64: kernelInstance( + 128, + 32, + 128, + 128, + 32, + 32, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 65: kernelInstance( + 256, + 16, + 256, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 66: kernelInstance( + 256, + 32, + 256, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 1, + "Intrawave", + 2, + ), + # mem(Interwave): Latency friendly + 67: kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 
1, + "Interwave", + 1, + ), + 68: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 1, + ), + 69: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 1, + ), + # mem(Interwave): Memory friendly, Col + 70: kernelInstance( + 256, + 256, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + [1, 32, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 71: kernelInstance( + 256, + 256, + 16, + 128, + 16, + 16, + 4, + 1, + [8, 32, 1], + [8, 16, 1], + [1, 32, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 72: kernelInstance( + 128, + 128, + 32, + 128, + 32, + 32, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 73: kernelInstance( + 128, + 128, + 16, + 128, + 16, + 16, + 4, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 74: kernelInstance( + 128, + 64, + 32, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 75: kernelInstance( + 128, + 64, + 16, + 128, + 16, + 16, + 2, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 76: kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + 77: kernelInstance( + 64, + 16, + 16, + 64, + 16, + 16, + 1, + 1, + [4, 16, 1], + [4, 16, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 78: kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 79: kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + 
"Interwave", + 2, + ), + 80: kernelInstance( + 128, + 16, + 64, + 128, + 16, + 16, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 81: kernelInstance( + 128, + 32, + 64, + 128, + 32, + 32, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 82: kernelInstance( + 128, + 16, + 128, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 83: kernelInstance( + 128, + 32, + 128, + 128, + 32, + 32, + 1, + 2, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), + 84: kernelInstance( + 256, + 16, + 256, + 128, + 16, + 16, + 1, + 4, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 16], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + 85: kernelInstance( + 256, + 32, + 256, + 128, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + [1, 16, 1, 16], + [8, 8, 1], + 1, + 1, + "Interwave", + 2, + ), +} + + +default_kernels_dict = { + # ( M, N, K): kernel: BLOCK_SIZE| MPerBLOCK| NPerBLOCK| KPerBLOCK| WAVE_TILE_M| WAVE_TILE_N| WAVE_MAP_M| WAVE_MAP_N| ABLOCK_TRANSFER| BBLOCK_TRANSFER| CBLOCK_TRANSFER| CBLOCK_SPV| CSHUFFLE_MX| CSHUFFLE_NX| LOOP_SCHED|PIPELINE_VERSION + (-1): kernelInstance( + 64, + 16, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 8, 1], + [8, 8, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Interwave", + 2, + ), + (-3): kernelInstance( + 128, + 32, + 16, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Interwave", + 2, + ), + (-4): kernelInstance( + 64, + 16, + 16, + 256, + 16, + 16, + 1, + 1, + [16, 4, 1], + [16, 4, 1], + [1, 16, 1, 4], + [4, 4, 1], + 1, + 1, + "Intrawave", + 1, + ), + (-5): kernelInstance( + 128, + 16, + 32, + 128, + 16, + 16, + 1, + 1, + [8, 16, 1], + [8, 16, 1], + [1, 16, 1, 8], + [2, 2, 1], + 1, + 1, + "Intrawave", + 2, + ), + (-6): kernelInstance( + 256, + 128, + 128, + 128, + 32, + 32, + 
// Return the smallest power of two that is >= num (1 for num <= 1).
//
// The result type is int, so the largest representable power of two is
// 1 << (bits - 2) (i.e. 1 << 30 for a 32-bit int). Inputs above that value
// used to produce a shift by 31 or 32, which overflows int; they now saturate
// at that largest representable power of two instead.
static constexpr int nextPow2(unsigned int num)
{
    if (num <= 1)
        return 1;
    // Number of value bits in the argument type.
    constexpr unsigned int kBits = CHAR_BIT * sizeof(num);
    // Largest shift that still yields a positive int.
    constexpr unsigned int kMaxShift = kBits - 2;
    // num - 1 is non-zero here, so __builtin_clz is well defined.
    const unsigned int shift = kBits - static_cast<unsigned int>(__builtin_clz(num - 1));
    return 1 << (shift < kMaxShift ? shift : kMaxShift);
}
+ + // First check if this shape is available in the direct lookup. + static const auto lookup = [] + { + return RowwiseKernelMap{GENERATE_LOOKUP_TABLE(ABDataType, DDataType, EDataType)}; + }(); + + TORCH_CHECK(id < lookup.size(), + "Kernel id " + std::to_string(id) +" is out of range!"); + auto it = lookup.find(id); + // If we found an optimal kernel, use it. + if (it != lookup.end()) + { + return it->second; + } + // Otherwise, use heuristics. + return lookup.find(0)->second; +} + + +torch::Tensor gemm_a8w8_tune( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + int kernelId, + int splitK) +{ + TORCH_CHECK(XQ.dtype() == at::ScalarType::Char && XQ.dtype() == WQ.dtype(), + "Weights and activations should both be int8!"); + TORCH_CHECK( x_scale.dtype() == w_scale.dtype(), + "Scales should have the same dtype!"); + std::optional bias = std::nullopt; + + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + int KBatch = std::pow(2, splitK); + + // if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::Half) + // { + // rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias); + // } + // else if (x_scale.dtype() == at::ScalarType::Float && Y.dtype() == at::ScalarType::BFloat16) + // { + // rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias); + // } + // else if (Y.dtype() == at::ScalarType::Half) + // { + // rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias); + // } + // else + if (Y.dtype() == at::ScalarType::BFloat16) + { + rowwise_dispatch(kernelId)(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + } + else + { + TORCH_CHECK(false, "Unsupported scales/output dtype!"); + } + return Y; +} diff --git a/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py new file mode 100644 index 0000000000000000000000000000000000000000..62f3dea11a638bf6a373d7f285f2f4bbdac0e2ac --- /dev/null +++ b/csrc/ck_gemm_a8w8/gemm_a8w8_tune.py @@ -0,0 
def tune_gemm(m, n, k, useSplitK=False):
    """Benchmark every a8w8 GEMM kernel instance for one (M, N, K) shape.

    Random int8 activations/weights and bf16 scales are created on the
    "cuda" device, a reference result is computed with ``run_torch`` and each
    entry of ``kernels_list`` is timed through ``kernel_instance_test``.
    Only kernels whose output passes ``checkClose`` against the reference can
    be selected.

    Args:
        m, n, k: GEMM problem size; ``x`` is ``(m, k)`` and ``weight`` is
            ``(n, k)``.
        useSplitK: when true, sweep ``splitK`` from 0 up to the value returned
            by ``aiter.compute_gemm_SplitK`` for each kernel; otherwise only
            ``splitK == 0`` is tried.

    Returns:
        ``(best_kernelId, splitK, best_time)`` where ``best_time`` is the
        average time in microseconds rounded to 4 digits. When no kernel
        produced a correct result the tuple is ``(-1, 0, "nan")``.
    """
    dim = (m, n, k)
    x = torch.randint(-20, 20, (m, k), dtype=dtypes.i8, device="cuda")
    weight = torch.randint(-20, 20, (n, k), dtype=dtypes.i8, device="cuda")
    x_scale = torch.rand([m, 1], dtype=dtypes.bf16, device="cuda")
    w_scale = torch.rand([1, n], dtype=dtypes.bf16, device="cuda")
    out = torch.empty(m, n, dtype=dtypes.bf16, device="cuda")

    ref_out = run_torch(x, weight, x_scale, w_scale)

    print(f"*******************M:{m} X N:{n} X K:{k}**************************")
    print(f"Start tuning a8w8 gemm kernel for M:{m}, N:{n}, K:{k}:")
    kernels_num = len(kernels_list)
    # (-1, 0) is the "nothing usable found" sentinel returned to the caller.
    best_kernelConfig = (-1, 0)
    best_time = -1
    for i in range(kernels_num):
        kernel = kernels_list[i]
        maxsplitK = (
            aiter.compute_gemm_SplitK(
                m, n, k, kernel.MPerBLOCK, kernel.NPerBLOCK, kernel.KPerBLOCK
            )
            if useSplitK
            else 0
        )
        for splitK in range(maxsplitK + 1):
            try:
                (out), avg_t = kernel_instance_test(
                    x, weight, x_scale, w_scale, out, i, splitK
                )
            except RuntimeError:
                # The kernel rejected this shape/splitK combination.
                print(
                    f"{str(dim):<20} kernelid:{i:<3d}\t No support , {kernel.name}, {splitK=}"
                )
                continue

            if not checkClose(ref_out, out, rtol=1e-2, atol=0.01):
                print(
                    f"{str(dim):<20} kernelid:{i:<3d}\t No pass         , {kernel.name}, {splitK=}"
                )
                continue

            print(
                f"{str(dim):<20} kernelid:{i:<3d}\t avg: {avg_t:<8.2f} us, {kernel.name}, {splitK=}"
            )
            if best_time < 0 or avg_t < best_time:
                best_kernelConfig = (i, splitK)
                best_time = avg_t

    best_kernelId, splitK = best_kernelConfig
    if best_kernelId == -1:
        print(f"No kernel can be used for M:{m}, N:{n}, K:{k}")
        best_time = "nan"
        # Do not index kernels_list with the -1 sentinel: it is not a valid
        # kernel id, so the lookup would fail (or name an unrelated kernel).
        best_kernelName = "None"
    else:
        best_time = round(best_time, 4)
        best_kernelName = kernels_list[best_kernelId].name

    print(
        f"Tuning result for M:{m}, N:{n}, K:{k} is kernelId={best_kernelId} {best_kernelName} {splitK=}, {best_time}us"
    )
    print(f"*******************M:{m} X N:{n} X K:{k}**************************")

    return best_kernelId, splitK, best_time
= pd.concat([tunedf, temp], ignore_index=True) + + else: + print(f"M:{M}, N:{N}, K{K} is in tuned gemm, skip!!!") + print() + print() + if issorted: + tunedf = tunedf.sort_values(by=["M", "N", "K"]) + print("Totall tuning result:") + print(tunedf) + return tunedf + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK gemm a8w8 kernel", + ) + + parser.add_argument( + "-i", + "--untune_file", + default="aiter/configs/a8w8_untuned_gemm.csv", + required=False, + help="input", + ) + + parser.add_argument( + "-o", + "--tune_file", + default="aiter/configs/a8w8_tuned_gemm.csv", + required=False, + help="output: tuning result store this file", + ) + + parser.add_argument( + "-k", "--splitK", action="store_true", required=False, help="Use splitK kernels" + ) + + parser.add_argument( + "--sort", + action="store_true", + required=False, + help="Arranged according to the M N K size", + ) + + args = parser.parse_args() + untunedf = get_untuned_gemm_list(args.untune_file) + tunedf = get_tuned_gemm_list(args.tune_file) + tunedf = tune_gemm_list(untunedf, tunedf, args.sort, args.splitK) + tunedf.to_csv(args.tune_file, index=False) diff --git a/csrc/ck_gemm_a8w8/gen_instances.py b/csrc/ck_gemm_a8w8/gen_instances.py new file mode 100644 index 0000000000000000000000000000000000000000..55772f25cbdb023ee22ca85f08fa552dd3bc6470 --- /dev/null +++ b/csrc/ck_gemm_a8w8/gen_instances.py @@ -0,0 +1,344 @@ +# SPDX-License-Identifier: MIT +import os +from pathlib import Path +import pandas as pd +import argparse +import shutil +from gemm_a8w8_common import kernelInstance, kernels_list, default_kernels_dict + + +class gemm_a8w8_fwd_codegen: + def __init__(self, working_path, istune=False): + self.working_path = working_path + self.impl_path = os.path.join(working_path, "impl") + self.instances_path = os.path.join(working_path, "instances") + self.istune = istune + + def gen_instance(self, k: kernelInstance): + INSTANCE_IMPL = f"""// 
SPDX-License-Identifier: MIT + + +#include "gemm_a8w8_common.cuh" + +template +torch::Tensor +{k.name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int KBatch) +{{{{ + // The smallest kernel we have available. Works well for memory bound shapes. + + // Check if this input needs to be padded. + int M = size_to_dim_(XQ.dim() - 1, XQ.sizes()); + int N = WQ.size(0); + int K = WQ.size(1); + bool pad = (M % {k.MPerBLOCK} != 0) || (N % {k.NPerBLOCK} != 0) || (K % ({k.KPerBLOCK} * KBatch) != 0); + using AccDataType = std::conditional_t, I32, F32>; + if (pad) + {{{{ + // pad + {{INSTANCE_CONTENT_pad}} + // pad + }}}} + else + {{{{ + // no pad + {{INSTANCE_CONTENT_nopad}} + // no pad + }}}} +}}}} + +""" + INSTANCE_CONTENT_bias = f"""if (bias != std::nullopt) + {{{{ + using DeviceGemmInstance = DeviceGemmHelper< + ABDataType, + AccDataType, + DDataType, EDataType, + MultiplyMultiplyAdd, + {k.BLOCK_SIZE}, + {k.MPerBLOCK}, + {k.NPerBLOCK}, + {k.KPerBLOCK}, + {k.WAVE_TILE_M}, + {k.WAVE_TILE_N}, + {k.WAVE_MAP_M}, + {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}, {k.CBLOCK_SPV[0]}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + ck::BlockGemmPipelineScheduler::{k.LOOP_SCHED}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. 
+ return gemm_a8w8_rowwise_impl(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + }}}} + else + {{{{ + using DeviceGemmInstance = DeviceGemmHelper< + ABDataType, + AccDataType, + DDataType, EDataType, + RowwiseScale, + {k.BLOCK_SIZE}, + {k.MPerBLOCK}, + {k.NPerBLOCK}, + {k.KPerBLOCK}, + {k.WAVE_TILE_M}, + {k.WAVE_TILE_N}, + {k.WAVE_MAP_M}, + {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + ck::BlockGemmPipelineScheduler::{k.LOOP_SCHED}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. + return gemm_a8w8_rowwise_impl(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); + }}}} +""" + INSTANCE_CONTENT_nobias = f"""using DeviceGemmInstance = DeviceGemmHelper< + ABDataType, + AccDataType, + DDataType, EDataType, + RowwiseScale, + {k.BLOCK_SIZE}, + {k.MPerBLOCK}, + {k.NPerBLOCK}, + {k.KPerBLOCK}, + {k.WAVE_TILE_M}, + {k.WAVE_TILE_N}, + {k.WAVE_MAP_M}, + {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + ck::BlockGemmPipelineScheduler::{k.LOOP_SCHED}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. 
+ return gemm_a8w8_rowwise_impl(XQ, WQ, x_scale, w_scale, Y, bias, KBatch); +""" + if self.istune: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="MNKPadding") + ), + INSTANCE_CONTENT_nopad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="Default") + ), + ) + else: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=INSTANCE_CONTENT_bias.format( + GemmSpec="MNKPadding" + ), + INSTANCE_CONTENT_nopad=INSTANCE_CONTENT_bias.format(GemmSpec="Default"), + ) + + Path(os.path.join(self.impl_path, f"{k.name}.cuh")).write_text( + INSTANCE_IMPL_str + ) + + INSTANCE_template = """// SPDX-License-Identifier: MIT + + +#include "{name}.cuh" + +template torch::Tensor +{name}<{dtypes}>( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int KBatch); + +""" + if self.istune: + INSTANCE_abI8_dBF16_eBF16 = INSTANCE_template.format( + name=k.name, dtypes="I8, B16" + ) + Path( + os.path.join(self.instances_path, f"{k.name}_abI8_dB16_eB16.cpp") + ).write_text(INSTANCE_abI8_dBF16_eBF16) + else: + for EDtype in ["B16", "F16"]: + for ABDtype in ["I8", "F8"]: + for DDtype in ["F32", EDtype]: + intsance = INSTANCE_template.format( + name=k.name, dtypes=f"{ABDtype}, {DDtype}, {EDtype}" + ) + Path( + os.path.join( + self.instances_path, + f"{k.name}_ab{ABDtype}_d{DDtype}_e{EDtype}.cpp", + ) + ).write_text(intsance) + + def gen_lookup_dict(self, kernels_dict): + LOOKUP_head = """#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#define GENERATE_LOOKUP_TABLE(ABTYPE, DTYPE, ETYPE) \\ + { \\""" + + LOOKUP_template = """ + {{{MNK}, \\ + {kernel_name}}}, \\""" + + LOOKUP_end = """ + } + +#endif // USE_ROCM +""" + with open(os.path.join(self.working_path, "gemm_a8w8_lookup.h"), "w") as f: + f.write(LOOKUP_head) + for mnk, k in kernels_dict.items(): + # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) + if 
def get_tune_dict(tune_dict_csv):
    """Build the shape -> kernel table used for code generation.

    Starts from a copy of ``default_kernels_dict`` and, when ``tune_dict_csv``
    exists, adds one ``(M, N, K) -> kernels_list[kernelId]`` entry per CSV row.

    Args:
        tune_dict_csv: path of the tuning-result CSV; it must provide the
            columns ``M``, ``N``, ``K`` and ``kernelId``. A missing file is
            not an error.

    Returns:
        A new dict. ``default_kernels_dict`` itself is never modified.
    """
    # Copy so that tuned entries never leak into the module-level defaults.
    tune_dict = dict(default_kernels_dict)
    if not os.path.exists(tune_dict_csv):
        return tune_dict

    tune_df = pd.read_csv(tune_dict_csv)
    rows = zip(tune_df["M"], tune_df["N"], tune_df["K"], tune_df["kernelId"])
    for M, N, K, kid in rows:
        # Plain ints keep the keys independent of the pandas/numpy dtypes.
        shape = (int(M), int(N), int(K))
        kid = int(kid)
        if kid not in kernels_list:
            # The tuner records kernelId -1 when no kernel passed for a shape;
            # such rows carry no usable kernel and are skipped.
            print(f"Skip tuned shape {shape}: unknown kernelId {kid}")
            continue
        tune_dict[shape] = kernels_list[kid]
    return tune_dict
default="./", + required=False, + help="the path where all the blobs are going to be generated", + ) + + parser.add_argument( + "-f", + "--tune_file", + default="aiter/configs/a8w8_tuned_gemm.csv", + required=False, + help="tune_file include the result after run gemm_a8w8_tune.py", + ) + + parser.add_argument( + "--tune", action="store_true", required=False, help="generated tune instances" + ) + + # parser.add_argument( + # "--out_type", + # default="all", + # required=False, + # help="Specifie the type of scale\n \ + # all: [bf16, fp16] \n \ + # bf16, fp16" + # ) + + # parser.add_argument( + # "--scale_type", + # default="all", + # required=False, + # help="Specifie the type of scale\n \ + # all: [fp32, same as out] \n \ + # same: [same as out]" + # ) + + args = parser.parse_args() + codegen = gemm_a8w8_fwd_codegen(args.working_path, args.tune) + + if args.tune: + codegen.gen_instances(kernels_list) + else: + codegen.gen_instances(get_tune_dict(args.tune_file)) diff --git a/csrc/ck_gemm_a8w8/include/gemm_a8w8.h b/csrc/ck_gemm_a8w8/include/gemm_a8w8.h new file mode 100644 index 0000000000000000000000000000000000000000..ec0de5239ea7f5a9929c21214a4f4d57b6cfbb90 --- /dev/null +++ b/csrc/ck_gemm_a8w8/include/gemm_a8w8.h @@ -0,0 +1,24 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include +#include +torch::Tensor gemm_a8w8( + // void gemm_a8w8( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int splitK); + +torch::Tensor gemm_a8w8_tune( + // void gemm_a8w8( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + int kernelId, + int splitK); diff --git a/csrc/ck_gemm_a8w8/include/gemm_a8w8_common.cuh b/csrc/ck_gemm_a8w8/include/gemm_a8w8_common.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9723f4625c8d9eee0f2b04576be544270155ac72 --- /dev/null +++ 
b/csrc/ck_gemm_a8w8/include/gemm_a8w8_common.cuh @@ -0,0 +1,282 @@ +#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#undef __HIP_NO_HALF_OPERATORS__ +#undef __HIP_NO_HALF_CONVERSIONS__ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/utility/blkgemmpipe_scheduler.hpp" + +template +using S = ck::Sequence; + +using I8 = int8_t; +using F8 = ck::f8_t; +using I32 = int; +using F16 = ck::half_t; +using B16 = ck::bhalf_t; +using FP8 = ck::f8_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +// using ADataType = I8; +// using BDataType = I8; +// using AccDataType = I32; +// using CShuffleDataType = F32; +// using ComputeDataType = I8; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Col; +using D2Layout = Row; +using DsLayout = ck::Tuple; +using DsLayout2 = ck::Tuple; +using ELayout = Row; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; + +template +struct RowwiseScale +{ + template + __host__ __device__ constexpr void + operator()(E &e, const C &c, const D0 &d0, const D1 &d1) const; + + template <> + __host__ __device__ constexpr void operator()( + EDataType &e, const AccDataType &c, const DDataType &d0, const DDataType &d1) 
const + { + const F32 x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1); + + e = ck::type_convert(x0_f); + } +}; + +template +struct MultiplyMultiplyAdd +{ + template + __host__ __device__ constexpr void + operator()(E &e, const C &c, const D0 &d0, const D1 &d1, const D2 &d2) const; + + template <> + __host__ __device__ constexpr void operator()( + EDataType &e, const AccDataType &c, const DDataType &d0, const DDataType &d1, const EDataType &d2) const + { + const float x0_f = + ck::type_convert(c) * ck::type_convert(d0) * ck::type_convert(d1) + ck::type_convert(d2); + + e = ck::type_convert(x0_f); + } +}; + +// using CDEElementOp = RowwiseScale; +// using CDEElementOp2 = MultiplyMultiplyAdd; + +template +using DsDataType = ck::Tuple; + +template +using DsDataType2 = ck::Tuple; + +#if 0 +template +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3 +// clang-format off +///######| ALayout| BLayout| DsLayout| ELayout| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +///######| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +///######| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| 
| | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| +///###### RRR +/// < Row, Row, DsLayout, ELayout, ADataType, BDataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 4, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>; +///###### RCR + < Row, Col, DsLayout, ELayout, ADataType, BDataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, 1, 1, S<1, 32, 1, 8>, S<8, 8, 1>, ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, I8>; +// clang-format on +#endif + +template < + typename ABDataType, + typename AccDataType, + typename DDataType, + typename EDataType, + typename CDEElementOp, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int WAVE_TILE_M, + int WAVE_TILE_N, + int WAVE_MAP_M, + int WAVE_MAP_N, + typename ABLOCK_TRANSFER, + typename BBLOCK_TRANSFER, + typename CBLOCK_TRANSFER, + typename CBLOCK_SPV, + int CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + int CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + ck::BlockGemmPipelineScheduler LOOP_SCHED, + ck::BlockGemmPipelineVersion PIPELINE_VERSION, + auto GEMM_SPEC = + ck::tensor_operation::device::GemmSpecialization::MNPadding> +using DeviceGemmHelper = + ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3< + ALayout, + BLayout, + std::conditional_t>, DsLayout, DsLayout2>, + ELayout, + ABDataType, + ABDataType, + std::conditional_t>, DsDataType, DsDataType2>, + EDataType, + AccDataType, + AccDataType, + 
AElementOp, + BElementOp, + CDEElementOp, + GEMM_SPEC, + BLOCK_SIZE, // Block Size + MBLOCK, // M per Block + NBLOCK, // N per Block + KBLOCK, // K per Block + KBLOCK / ABLOCK_TRANSFER{}.At(0), // AK1 + 16, // BK1 + WAVE_TILE_M, // M per Xdl + WAVE_TILE_N, // N per Xdl + WAVE_MAP_M, // Mxdl per Wave + WAVE_MAP_N, // Nxdl per Wave + ABLOCK_TRANSFER, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + KBLOCK / ABLOCK_TRANSFER{}.At(0), + KBLOCK / ABLOCK_TRANSFER{}.At(0), + 0, + BBLOCK_TRANSFER, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + 0, + CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + CBLOCK_TRANSFER, + CBLOCK_SPV, + LOOP_SCHED, + PIPELINE_VERSION, + ABDataType>; + + +template +__forceinline__ torch::Tensor gemm_a8w8_rowwise_impl( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + std::optional bias, + int KBatch) +{ + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + + int StrideA = K; + int StrideB = K; + int StrideE = N; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(XQ)); + auto device_gemm = DeviceGemmInstance{}; + auto invoker = device_gemm.MakeInvoker(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + using AccDataType = std::conditional_t, I32, F32>; + if constexpr(has_bias) + { + auto cde_element_op = MultiplyMultiplyAdd{}; + + auto argument = device_gemm.MakeArgument( + reinterpret_cast(XQ.data_ptr()), + reinterpret_cast(WQ.data_ptr()), + std::array{reinterpret_cast(w_scale.data_ptr()), + reinterpret_cast(x_scale.data_ptr()), + reinterpret_cast(bias.value().data_ptr())}, + reinterpret_cast(Y.data_ptr()), + M, + N, + K, + StrideA, + StrideB, + std::array{0, 0, 0}, + StrideE, + KBatch, + a_element_op, + b_element_op, + cde_element_op); + TORCH_CHECK(device_gemm.IsSupportedArgument(argument), "This GEMM is not supported!"); + + invoker.Run(argument, StreamConfig{at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream()}); + } + 
else + { + auto cde_element_op = RowwiseScale{}; + + auto argument = device_gemm.MakeArgument( + reinterpret_cast(XQ.data_ptr()), + reinterpret_cast(WQ.data_ptr()), + std::array{reinterpret_cast(w_scale.data_ptr()), + reinterpret_cast(x_scale.data_ptr())}, + reinterpret_cast(Y.data_ptr()), + M, + N, + K, + StrideA, + StrideB, + std::array{0, 0}, + StrideE, + KBatch, + a_element_op, + b_element_op, + cde_element_op); + TORCH_CHECK(device_gemm.IsSupportedArgument(argument), "This GEMM is not supported!"); + + invoker.Run(argument, StreamConfig{at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream()}); + } + return Y; +} + +#endif // USE_ROCM diff --git a/csrc/ck_gemm_a8w8_blockscale/README.md b/csrc/ck_gemm_a8w8_blockscale/README.md new file mode 100644 index 0000000000000000000000000000000000000000..78b6345d4ed96f68f3503760e15adf159a49eaa1 --- /dev/null +++ b/csrc/ck_gemm_a8w8_blockscale/README.md @@ -0,0 +1,28 @@ +# CK gemm a8w8 blockscale tune + +1. Install aiter: +`cd $aiter_path` +`python3 setup.py develop` + +2. Add GEMM shapes in `aiter/configs/a8w8_blockscale_untuned_gemm.csv` + |**M**|**N**|**K**| + |-----|-----|-----| + |128 |1536 |7168 | + +3. Start tuning: +Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_blockscale_tune via jit: +`python3 csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py -i aiter/configs/a8w8_blockscale_untuned_gemm.csv -o aiter/configs/a8w8_blockscale_tuned_gemm.csv` +You can find the results of the tuning in `aiter/configs/a8w8_blockscale_tuned_gemm.csv`. + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| + |----------|-----|-----|-----|------------|----------|------|--------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | + + `cu_num` means the number of compute units, and it is used to distinguish between graphics. + +4. 
Build tuned kernels and test: +Test the performance, modify the test instance in `op_tests/test_gemm_a8w8_blockscale.py` and run it, please wait a few minutes as it will build gemm_a8w8_blockscale tuned kernels in `aiter/configs/a8w8_blockscale_tuned_gemm.csv` via jit: +`python3 op_tests/test_gemm_a8w8_blockscale.py` +If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `aiter/configs/test_gemm_a8w8_blockscale.csv`. + +## More +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. \ No newline at end of file diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu new file mode 100644 index 0000000000000000000000000000000000000000..9a3420c1628c976f7c605f0b9073dd42b12ec091 --- /dev/null +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale.cu @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT + + +#include "gemm_a8w8_blockscale_common.cuh" +#include "gemm_a8w8_blockscale_manifest.h" +#include "gemm_a8w8_blockscale_lookup.h" +#include + + +using BlockwiseKernel = std::function< + torch::Tensor(torch::Tensor &, torch::Tensor &, + torch::Tensor &, torch::Tensor &, + torch::Tensor &)>; + +// Define a custom hash function for std::tuple +struct IntTupleHash +{ + size_t operator()(const std::tuple &t) const + { + auto hash1 = std::hash{}(std::get<0>(t)); + auto hash2 = std::hash{}(std::get<1>(t)); + auto hash3 = std::hash{}(std::get<2>(t)); + return hash1 ^ hash2 ^ hash3; + } +}; + +using BlockwiseKernelMap = std::unordered_map< + std::tuple, + BlockwiseKernel, + 
// Helper function to return the next largest power of 2.
//
// Returns the smallest power of two that is >= num (1 for num <= 1).
// The result type is `int`, so the largest value that can be returned is
// 2^(bits-2) (2^30 for a 32-bit int); larger inputs saturate at that value
// instead of overflowing the signed shift or shifting by the full bit width,
// both of which are undefined behaviour.
static constexpr int nextPow2(unsigned int num)
{
    if (num <= 1)
        return 1;

    constexpr unsigned int kBits = CHAR_BIT * sizeof(num);
    // Largest power of two representable in the signed return type.
    constexpr unsigned int kMaxPow2 = 1u << (kBits - 2);
    if (num > kMaxPow2)
        return static_cast<int>(kMaxPow2);

    // num - 1 >= 1 here, so __builtin_clz never sees 0 (undefined for 0).
    // The shift is done on an unsigned literal so no signed overflow can occur.
    return static_cast<int>(1u << (kBits - static_cast<unsigned int>(__builtin_clz(num - 1))));
}
@dataclass
class kernelInstance:
    """Tuning parameters of one a8w8 block-scale GEMM kernel instance.

    Instances are built positionally, so the field order below is part of the
    interface and must not change.  The short descriptions follow the column
    headers of the kernel tables defined in this module.
    """

    BLOCK_SIZE: int  # block size
    ScaleBlockM: int  # scale block M
    ScaleBlockN: int  # scale block N
    ScaleBlockK: int  # scale block K
    MPerBLOCK: int  # M per block
    NPerBLOCK: int  # N per block
    KPerBLOCK: int  # K per block
    AK1: int
    BK1: int
    MPerXDL: int  # M per XDL
    NPerXDL: int  # N per XDL
    WAVE_MAP_M: int  # MXdl per wave
    WAVE_MAP_N: int  # NXdl per wave
    ABLOCK_TRANSFER: list[int]  # A block-transfer thread cluster lengths (K0, M, K1)
    BBLOCK_TRANSFER: list[int]  # B block-transfer thread cluster lengths (K0, N, K1)
    CSHUFFLE_MX_PER_WAVE_PERSHUFFLE: int
    CSHUFFLE_NX_PER_WAVE_PERSHUFFLE: int
    CBLOCK_TRANSFER: list[int]  # C block-transfer cluster lengths
    CBLOCK_SPV: list[int]  # C block-transfer scalar-per-vector
    PIPELINE_Sched: str  # block-wise GEMM pipeline scheduler, e.g. "Intrawave"
    PIPELINE_VERSION: int  # block-wise GEMM pipeline version

    @property
    def name(self) -> str:
        """Return the identifier built from the tuning parameters.

        Groups of related parameters are joined with ``x`` and the groups are
        joined with ``_``.  ``WAVE_MAP_M`` / ``WAVE_MAP_N`` are not part of the
        name, so two instances differing only in those fields share a name.
        """

        def joined(values) -> str:
            # "x"-separated rendering of one parameter group.
            return "x".join(map(str, values))

        return "_".join(
            [
                "a8w8_blockscale",
                joined([self.ScaleBlockM, self.ScaleBlockN, self.ScaleBlockK]),
                joined(
                    [
                        self.BLOCK_SIZE,
                        self.MPerBLOCK,
                        self.NPerBLOCK,
                        self.KPerBLOCK,
                    ]
                ),
                joined([self.AK1, self.BK1]),
                joined([self.MPerXDL, self.NPerXDL]),
                joined(self.ABLOCK_TRANSFER),
                joined(self.BBLOCK_TRANSFER),
                joined(self.CBLOCK_TRANSFER),
                joined(self.CBLOCK_SPV),
                joined(
                    [
                        self.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE,
                        self.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE,
                    ]
                ),
                self.PIPELINE_Sched.lower(),
                f"v{self.PIPELINE_VERSION}",
            ]
        )
[1, 32, 1, 8], + [8], + "Intrawave", + 3, + ), + 3: kernelInstance( + 256, + 1, + 128, + 128, + 64, + 64, + 128, + 16, + 16, + 32, + 32, + 1, + 1, + [8, 32, 1], + [8, 32, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 3, + ), + # Memory friendly + 4: kernelInstance( + 256, + 1, + 128, + 128, + 16, + 256, + 128, + 8, + 16, + 16, + 16, + 1, + 4, + [16, 16, 1], + [8, 32, 1], + 1, + 2, + [1, 16, 1, 16], + [8], + "Intrawave", + 1, + ), + 5: kernelInstance( + 256, + 1, + 128, + 128, + 16, + 128, + 128, + 8, + 16, + 16, + 16, + 1, + 2, + [16, 16, 1], + [8, 32, 1], + 1, + 2, + [1, 16, 1, 16], + [8], + "Intrawave", + 1, + ), + 6: kernelInstance( + 256, + 1, + 128, + 128, + 16, + 64, + 128, + 8, + 16, + 16, + 16, + 1, + 1, + [16, 16, 1], + [8, 32, 1], + 1, + 1, + [1, 16, 1, 16], + [4], + "Intrawave", + 1, + ), + 7: kernelInstance( + 256, + 1, + 128, + 128, + 16, + 128, + 256, + 16, + 16, + 16, + 16, + 1, + 2, + [16, 16, 1], + [16, 16, 1], + 1, + 2, + [1, 16, 1, 16], + [8], + "Intrawave", + 1, + ), + 8: kernelInstance( + 256, + 1, + 128, + 128, + 16, + 64, + 256, + 16, + 16, + 16, + 16, + 1, + 1, + [16, 16, 1], + [16, 16, 1], + 1, + 1, + [1, 16, 1, 16], + [4], + "Intrawave", + 1, + ), + 9: kernelInstance( + 256, + 1, + 128, + 128, + 32, + 256, + 128, + 16, + 16, + 32, + 32, + 1, + 2, + [8, 32, 1], + [8, 32, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 10: kernelInstance( + 256, + 1, + 128, + 128, + 32, + 128, + 128, + 16, + 16, + 32, + 32, + 1, + 1, + [8, 32, 1], + [8, 32, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 11: kernelInstance( + 256, + 1, + 128, + 128, + 32, + 64, + 128, + 16, + 16, + 16, + 16, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + 2, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 12: kernelInstance( + 256, + 1, + 128, + 128, + 32, + 128, + 256, + 16, + 16, + 32, + 32, + 1, + 1, + [16, 16, 1], + [16, 16, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 13: kernelInstance( + 256, + 1, + 128, + 
128, + 32, + 64, + 256, + 16, + 16, + 16, + 16, + 2, + 1, + [16, 16, 1], + [16, 16, 1], + 2, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 14: kernelInstance( + 256, + 1, + 128, + 128, + 64, + 256, + 128, + 16, + 16, + 32, + 32, + 2, + 2, + [8, 32, 1], + [8, 32, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 15: kernelInstance( + 256, + 1, + 128, + 128, + 64, + 128, + 128, + 16, + 16, + 32, + 32, + 2, + 1, + [8, 32, 1], + [8, 32, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 16: kernelInstance( + 256, + 1, + 128, + 128, + 64, + 64, + 128, + 16, + 16, + 32, + 32, + 1, + 1, + [8, 32, 1], + [8, 32, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 17: kernelInstance( + 256, + 1, + 128, + 128, + 64, + 128, + 256, + 16, + 16, + 32, + 32, + 2, + 1, + [16, 16, 1], + [16, 16, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + 18: kernelInstance( + 256, + 1, + 128, + 128, + 64, + 64, + 256, + 16, + 16, + 32, + 32, + 1, + 1, + [16, 16, 1], + [16, 16, 1], + 1, + 1, + [1, 32, 1, 8], + [8], + "Intrawave", + 1, + ), + # clang-format on +} + + +default_kernels_dict = { + # clang-format off + ##############| Block| Scale| Scale| Scale| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| BBlockTransfer| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm| + ###############| Size| Block| Block| Block| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| Pipeline | Pipeline| + ###############| | M| N| K| | | | | | | | Wave| Wave| Lengths_K0_M_K1| Lengths_K0_N_K1| PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| Scheduler | Verision| + ###############| | | | | | | | | | | | | | | | | | | | | | + # Compute friendly + (-1): kernelInstance( + 256, + 1, + 128, + 128, + 16, + 128, + 256, + 16, + 16, + 16, + 16, + 1, + 2, + [16, 16, 1], 
+ [16, 16, 1], + 1, + 2, + [1, 16, 1, 16], + [8], + "Intrawave", + 1, + ), +} diff --git a/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.cu b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.cu new file mode 100644 index 0000000000000000000000000000000000000000..44c82d271469bcef2c3e63924024e4d25e6b5441 --- /dev/null +++ b/csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.cu @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: MIT + + +#include "gemm_a8w8_blockscale_common.cuh" +#include "gemm_a8w8_blockscale_manifest.h" +#include "gemm_a8w8_blockscale_lookup.h" +#include + +using BlockwiseKernel = std::function< + torch::Tensor(torch::Tensor &, torch::Tensor &, + torch::Tensor &, torch::Tensor &, + torch::Tensor &)>; + +// For certain high priority shapes, we directly use the best kernel rather +// than use heuristics. +using BlockwiseKernelMap = std::unordered_map< + int, + BlockwiseKernel>; + +// Helper function to return the next largest power of 2 +static constexpr int nextPow2(unsigned int num) +{ + if (num <= 1) + return 1; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +template +BlockwiseKernel blockwise_dispatch(int id) +{ + // For a given shape, either find the best kernel via lookup or heuristic. + // For many small M shapes, we bucket them to the next largest kernel. + // This is fine since kernels are padded anyway. + + // First check if this shape is available in the direct lookup. + static const auto lookup = [] + { + if constexpr (std::is_same_v) { + return BlockwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType,F16)}; + } else if constexpr (std::is_same_v) { + return BlockwiseKernelMap{GENERATE_LOOKUP_TABLE(DDataType,B16)}; + } else { + static_assert(false, "blockwise_dispatch used with unsupported dtype!"); + } }(); + + TORCH_CHECK(id < lookup.size(), + "Kernel id " + std::to_string(id) +" is out of range!"); + auto it = lookup.find(id); + // If we found an optimal kernel, use it. 
def checkClose(a, b, rtol=1e-3, atol=0.01, max_mismatch_ratio=0.01):
    """Return True when *a* and *b* agree on all but a small share of elements.

    Elements are compared with ``torch.isclose(a, b, rtol=rtol, atol=atol)``.
    The check passes when the fraction of mismatching elements (relative to
    ``a.numel()``) does not exceed ``max_mismatch_ratio``; the default of 1 %
    matches the previously hard-coded tolerance.

    Args:
        a: reference tensor.
        b: tensor under test.
        rtol: relative tolerance forwarded to ``torch.isclose``.
        atol: absolute tolerance forwarded to ``torch.isclose``.
        max_mismatch_ratio: largest accepted fraction of mismatching elements.

    Returns:
        bool: True if the tensors are close enough, False otherwise.
    """
    # Count mismatches on the boolean mask directly instead of gathering the
    # mismatching values of `a` just to count them.
    mismatched = int((~torch.isclose(a, b, rtol=rtol, atol=atol)).sum())
    if mismatched == 0:
        # Also covers empty tensors, where no ratio can be computed.
        return True
    return mismatched / a.numel() <= max_mismatch_ratio
def get_tuned_gemm_list(tuned_gemm_file):
    """Load previously tuned GEMM results, or start an empty result table.

    Args:
        tuned_gemm_file: path of the tuned-GEMM CSV file.

    Returns:
        pandas.DataFrame: the CSV content when the file exists, otherwise an
        empty frame that already has every result column.
    """
    if os.path.exists(tuned_gemm_file):
        return pd.read_csv(tuned_gemm_file)
    # The empty table must carry "cu_num" as well: the tuning loop filters the
    # table on that column, which raises KeyError on a fresh run without it.
    return pd.DataFrame(
        columns=[
            "cu_num",
            "M",
            "N",
            "K",
            "kernelId",
            "splitK",
            "us",
            "kernelName",
        ]
    )
def tune_gemm_list(untunedf, tunedf, issorted=False, useSplitK=False):
    """Tune every shape of *untunedf* that has no result yet for this GPU.

    Args:
        untunedf: DataFrame with the columns ``M``, ``N`` and ``K``.
        tunedf: DataFrame of earlier results.  A table without a ``cu_num``
            column (freshly created, or written before that column existed)
            is accepted; none of its rows count as tuned for this GPU.
        issorted: sort the returned table by ``cu_num``, ``M``, ``N``, ``K``.
        useSplitK: forwarded to ``tune_gemm``.

    Returns:
        pandas.DataFrame: *tunedf* extended by one row per newly tuned shape.
    """
    gpu = torch.cuda.current_device()
    cu_num = torch.cuda.get_device_properties(gpu).multi_processor_count

    if "cu_num" not in tunedf.columns:
        # NaN never compares equal, so such rows are kept but never matched.
        tunedf = tunedf.assign(cu_num=float("nan"))

    # One set lookup per shape instead of a boolean filter over the whole
    # table; the set is extended as shapes get tuned so that duplicates in
    # `untunedf` are still tuned only once.
    already_tuned = set(
        zip(tunedf["M"], tunedf["N"], tunedf["K"], tunedf["cu_num"])
    )
    new_rows = []
    # itertuples does not depend on `untunedf` having a default RangeIndex.
    for shape in untunedf.itertuples(index=False):
        # Plain Python ints rather than numpy scalars for the tuner.
        M, N, K = int(shape.M), int(shape.N), int(shape.K)

        if (M, N, K, cu_num) not in already_tuned:
            kernelId, splitK, time = tune_gemm(M, N, K, useSplitK)
            kernelName = "None" if kernelId == -1 else kernels_list[kernelId].name
            new_rows.append(
                {
                    "M": M,
                    "N": N,
                    "K": K,
                    "cu_num": cu_num,
                    "kernelId": kernelId,
                    "splitK": splitK,
                    "us": time,
                    "kernelName": kernelName,
                }
            )
            already_tuned.add((M, N, K, cu_num))
        else:
            print(f"M:{M}, N:{N}, K{K} is in tuned gemm, skip!!!")
        print()
        print()

    if new_rows:
        # Single concatenation instead of one per tuned shape.
        tunedf = pd.concat([tunedf, pd.DataFrame(new_rows)], ignore_index=True)
    if issorted:
        tunedf = tunedf.sort_values(by=["cu_num", "M", "N", "K"])
    print("Totall tuning result:")
    print(tunedf)
    return tunedf
b/csrc/ck_gemm_a8w8_blockscale/gen_instances.py @@ -0,0 +1,298 @@ +# SPDX-License-Identifier: MIT +import os +from pathlib import Path +import pandas as pd +import argparse +import shutil +import torch +from gemm_a8w8_blockscale_common import ( + kernelInstance, + kernels_list, + default_kernels_dict, +) + + +""" + +a8w8_blockscale_gemm instance gen + +""" + + +class gemm_a8w8_blockscale_codegen: + def __init__(self, working_path, istune=False): + self.working_path = working_path + self.impl_path = os.path.join(working_path, "impl") + self.instances_path = os.path.join(working_path, "instances") + self.istune = istune + + def gen_instance(self, k: kernelInstance): + INSTANCE_IMPL = f"""// SPDX-License-Identifier: MIT + + +#include "gemm_a8w8_blockscale_common.cuh" + +template +torch::Tensor +{k.name}( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y + ) +{{{{ + // The smallest kernel we have available. Works well for memory bound shapes. + + // Check if this input needs to be padded. 
+ int M = size_to_dim_(XQ.dim() - 1, XQ.sizes()); + int N = WQ.size(0); + int K = WQ.size(1); + bool pad = (M % {k.MPerBLOCK} != 0) || (N % {k.NPerBLOCK} != 0) || (K % ({k.KPerBLOCK}) != 0); + if (pad) + {{{{ + // pad + {{INSTANCE_CONTENT_pad}} + // pad + }}}} + else + {{{{ + // no pad + {{INSTANCE_CONTENT_nopad}} + // no pad + }}}} +}}}} + +""" + + INSTANCE_CONTENT_nobias = f"""using DeviceGemmInstance = DeviceGemmHelperF8BlockScale< + DDataType, EDataType, + {k.BLOCK_SIZE}, + {k.ScaleBlockM}, {k.ScaleBlockN}, {k.ScaleBlockK}, + {k.MPerBLOCK}, {k.NPerBLOCK}, {k.KPerBLOCK}, + {k.AK1}, {k.BK1}, + {k.MPerXDL}, {k.NPerXDL}, + {k.WAVE_MAP_M}, {k.WAVE_MAP_N}, + S<{(", ").join(map(lambda x:str(x),k.ABLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.BBLOCK_TRANSFER))}>, + {k.CSHUFFLE_MX_PER_WAVE_PERSHUFFLE}, + {k.CSHUFFLE_NX_PER_WAVE_PERSHUFFLE}, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_TRANSFER))}>, + S<{(", ").join(map(lambda x:str(x),k.CBLOCK_SPV))}>, + ck::BlockGemmPipelineScheduler::{k.PIPELINE_Sched}, + ck::BlockGemmPipelineVersion::v{k.PIPELINE_VERSION}, + ck::tensor_operation::device::GemmSpecialization::{{GemmSpec}}>; + // Run kernel instance. 
+ return gemm_a8w8_blockscale_impl(XQ, WQ, x_scale, w_scale, Y); +""" + if self.istune: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="MNKPadding") + ), + INSTANCE_CONTENT_nopad=( + INSTANCE_CONTENT_nobias.format(GemmSpec="Default") + ), + ) + else: + INSTANCE_IMPL_str = INSTANCE_IMPL.format( + INSTANCE_CONTENT_pad=INSTANCE_CONTENT_nobias.format( + GemmSpec="MNKPadding" + ), + INSTANCE_CONTENT_nopad=INSTANCE_CONTENT_nobias.format( + GemmSpec="Default" + ), + ) + + Path(os.path.join(self.impl_path, f"{k.name}.cuh")).write_text( + INSTANCE_IMPL_str + ) + + INSTANCE_template = """// SPDX-License-Identifier: MIT + + +#include "{name}.cuh" + +template torch::Tensor +{name}<{dtypes}>( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y + ); + +""" + INSTANCE_dFP32_eBF16 = INSTANCE_template.format(name=k.name, dtypes="F32, B16") + INSTANCE_dFP32_eFP16 = INSTANCE_template.format(name=k.name, dtypes="F32, F16") + # TODO: dFP8_eFP8 + + if self.istune: + Path( + os.path.join(self.instances_path, f"{k.name}_dBF16_eBF16.cpp") + ).write_text(INSTANCE_dFP32_eBF16) + else: + Path( + os.path.join(self.instances_path, f"{k.name}_dFP32_eBF16.cpp") + ).write_text(INSTANCE_dFP32_eBF16) + Path( + os.path.join(self.instances_path, f"{k.name}_dFP32_eFP16.cpp") + ).write_text(INSTANCE_dFP32_eFP16) + + def gen_lookup_dict(self, kernels_dict): + LOOKUP_head = """#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#define GENERATE_LOOKUP_TABLE(DTYPE, ETYPE) \\ + { \\""" + + LOOKUP_template = """ + {{{MNK}, \\ + {kernel_name}}}, \\""" + + LOOKUP_end = """ + } + +#endif // USE_ROCM +""" + with open( + os.path.join(self.working_path, "gemm_a8w8_blockscale_lookup.h"), "w" + ) as f: + f.write(LOOKUP_head) + for mnk, k in kernels_dict.items(): + # print((", ").join(map(lambda x: str(x), list(mnk))), ":", k.name) + if not self.istune and (isinstance(mnk, 
def get_tune_dict(tune_dict_csv):
    """Build the shape -> kernel mapping used for code generation.

    Starts from a copy of ``default_kernels_dict`` and adds one
    ``(M, N, K) -> kernelInstance`` entry per usable row of the tuned CSV.
    When CUDA is available only rows tuned for the current device's
    ``cu_num`` are used.

    Args:
        tune_dict_csv: path of the tuned-GEMM CSV; a missing file is not an
            error and yields just the defaults.

    Returns:
        dict: a new dictionary; the module-level defaults are left untouched.
    """
    # Copy: the original aliased the module-level dict and mutated it.
    tune_dict = dict(default_kernels_dict)
    if not os.path.exists(tune_dict_csv):
        return tune_dict

    tune_df = pd.read_csv(tune_dict_csv)
    if torch.cuda.is_available():
        gpu = torch.cuda.current_device()
        cu_num = torch.cuda.get_device_properties(gpu).multi_processor_count
        tune_df = tune_df[tune_df["cu_num"] == cu_num]

    for row in tune_df.itertuples(index=False):
        kid = int(row.kernelId)
        if kid not in kernels_list:
            # The tuner records kernelId -1 when no kernel passed; such rows
            # have no instance to generate and previously raised KeyError.
            continue
        tune_dict[(int(row.M), int(row.N), int(row.K))] = kernels_list[kid]
    return tune_dict
prog="generate", + description="gen API for CK gemm a8w8 kernel", + ) + + # the directory for list_blobs/gen_blobs to write files into + parser.add_argument( + "-w", + "--working_path", + default="./", + required=False, + help="the path where all the blobs are going to be generated", + ) + + parser.add_argument( + "-f", + "--tune_file", + default="aiter/configs/a8w8_blockscale_tuned_gemm.csv", + required=False, + help="tune_file include the result after run gemm_a8w8_tune.py", + ) + + parser.add_argument( + "--tune", action="store_true", required=False, help="generated tune instances" + ) + + # parser.add_argument( + # "--out_type", + # default="all", + # required=False, + # help="Specifie the type of scale\n \ + # all: [bf16, fp16] \n \ + # bf16, fp16" + # ) + + # parser.add_argument( + # "--scale_type", + # default="all", + # required=False, + # help="Specifie the type of scale\n \ + # all: [fp32, same as out] \n \ + # same: [same as out]" + # ) + + args = parser.parse_args() + codegen = gemm_a8w8_blockscale_codegen(args.working_path, args.tune) + + if args.tune: + codegen.gen_instances(kernels_list) + else: + codegen.gen_instances(get_tune_dict(args.tune_file)) diff --git a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h new file mode 100644 index 0000000000000000000000000000000000000000..ffae22e70247b538e729f9d6ce5c6d56424e88c1 --- /dev/null +++ b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h @@ -0,0 +1,20 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include +#include +torch::Tensor gemm_a8w8_blockscale( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y); + +torch::Tensor gemm_a8w8_blockscale_tune( + torch::Tensor &XQ, + torch::Tensor &WQ, + torch::Tensor &x_scale, + torch::Tensor &w_scale, + torch::Tensor &Y, + int kernelId, + int splitK); diff --git 
a/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_common.cuh b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_common.cuh new file mode 100644 index 0000000000000000000000000000000000000000..07b4ef0f421b0512b45f2d9600b94aadf067c4d5 --- /dev/null +++ b/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale_common.cuh @@ -0,0 +1,166 @@ +#pragma once +// SPDX-License-Identifier: MIT + + +#ifdef USE_ROCM + +#undef __HIP_NO_HALF_OPERATORS__ +#undef __HIP_NO_HALF_CONVERSIONS__ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/utility/blkgemmpipe_scheduler.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using B16 = ck::bhalf_t; +using FP8 = ck::f8_t; +using F32 = float; +using I8 = int8_t; +using I32 = int; +using F16 = ck::half_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using A0DataType = FP8; +using A1DataType = F32; +using B0DataType = FP8; +using B1DataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = BF16; + +using A0Layout = Row; +using B0Layout = Col; +using D0Layout = Row; +using D1Layout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using PassThrough = 
ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +// static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +// static constexpr ck::index_t Scale_Block_M = 1; +// static constexpr ck::index_t Scale_Block_N = 128; +// static constexpr ck::index_t Scale_Block_K = 128; + +template +using DeviceGemmHelperF8BlockScale = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_Xdl_CShuffle_V3 + // clang-format off + , S<1, 0, 2>, + 2, AK1, AK1, 0, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + S<1, 0, 2>, S<1, 0, 2>, + 2, BK1, BK1, 0, + CSHUFFLE_MX_PER_WAVE_PERSHUFFLE, + CSHUFFLE_NX_PER_WAVE_PERSHUFFLE, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEShuffleBlockTransferScalarPerVectors, + BlkGemmPipeSched, + BlkGemmPipelineVer, A0DataType>; + // clang-format on + +template +__forceinline__ torch::Tensor gemm_a8w8_blockscale_impl( + torch::Tensor& XQ, + torch::Tensor& WQ, + torch::Tensor& x_scale, + torch::Tensor& w_scale, + torch::Tensor& Y) +{ + int M = XQ.size(0); + int N = WQ.size(0); + int K = XQ.size(1); + + int StrideA = XQ.stride(-2); + int StrideB = WQ.stride(-2); + int StrideE = N; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + constexpr ck::index_t NumDTensor = DsDataType::Size(); + + // do GEMM + auto device_gemm = DeviceGemmInstance{}; + auto invoker = device_gemm.MakeInvoker(); + auto argument = device_gemm.MakeArgument(XQ.data_ptr(), + WQ.data_ptr(), + std::array{}, + reinterpret_cast(Y.data_ptr()), + M, + N, + K, + StrideA, + StrideB, + std::array{}, + StrideE, + reinterpret_cast(x_scale.data_ptr()), + reinterpret_cast(w_scale.data_ptr()), + a_element_op, + b_element_op, + cde_element_op); + + TORCH_CHECK(device_gemm.IsSupportedArgument(argument), "This GEMM is not supported!"); + + invoker.Run(argument, 
StreamConfig{at::cuda::getCurrentCUDAStream().stream()}); + return Y; +} + +#endif // USE_ROCM diff --git a/csrc/cpp_itfs/mha_bwd_generate.py b/csrc/cpp_itfs/mha_bwd_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..2dd15b4c4b3f0e30d25e17a0e795c3e12d1e9bb2 --- /dev/null +++ b/csrc/cpp_itfs/mha_bwd_generate.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: MIT + # generate kernel instances to speed up compilation + +import argparse +from pathlib import Path +from typing import Optional + +GEN_DIR = "" # in Cmake, have to generate files in same folder + +AITER_API_FILENAME = "mha_bwd.cpp" + +AITER_CPP_API = """#include "mha_bwd.h" + +namespace aiter {{ +mha_bwd_traits get_mha_bwd_traits(int head_size_q, + int head_size_v, + std::string dtype, + bool is_group_mode, + mask_enum mask_type, + bias_enum bias_type, + bool has_dbias, + bool has_dropout, + bool is_store_randval, + bool deterministic, + bool use_ext_asm, + bool is_v3_atomic_fp32, + int how_v3_bf16_cvt) +{{ + return mha_bwd_traits(head_size_q, + head_size_v, + dtype, + is_group_mode, + mask_type, + bias_type, + has_dbias, + has_dropout, + is_store_randval, + deterministic, + use_ext_asm, + is_v3_atomic_fp32, + how_v3_bf16_cvt); +}} + +// share with varlen(group mode) api +float mha_bwd(mha_bwd_args args, + const ck_tile::stream_config& stream_config, + std::string q_dtype_str, + bool is_group_mode, + mask_enum mask_type, + bias_enum bias_type, + bool has_dbias, + bool is_store_randval, + bool deterministic, + bool use_ext_asm, + bool is_v3_atomic_fp32, + int how_v3_bf16_cvt) +{{ + int head_size_q = args.hdim_q; + int head_size_v = args.hdim_v; + bool has_dropout = args.p_drop > 0; + // bool enable_ailib = args.alibi_slopes_ptr == nullptr; + auto traits = get_mha_bwd_traits(head_size_q, + head_size_v, + q_dtype_str, + is_group_mode, + mask_type, + bias_type, + has_dbias, + has_dropout, + is_store_randval, + deterministic, + use_ext_asm, + is_v3_atomic_fp32, + how_v3_bf16_cvt); + 
float t = -1; + {F_dispatch} + return t; +}} +}} // namespace aiter + +""" + +V2_API = "t = fmha_bwd(traits, args, stream_config);" + +V3_API = "t = fmha_bwd_v3(traits, args, stream_config);" + +COMBINED_API = """t = fmha_bwd_v3(traits, args, stream_config); + if (t == -1) { t = fmha_bwd(traits, args, stream_config); } +""" + +API_MAP = {1: V2_API, 2: V3_API, 3: COMBINED_API} + + +def write_blobs(output_dir: Optional[str], receipt) -> None: + if output_dir is None: + output_dir = Path(__file__).parent + else: + output_dir = Path(output_dir) / GEN_DIR + + output_dir.mkdir(parents=True, exist_ok=True) + + api = AITER_CPP_API.format(F_dispatch=API_MAP[receipt]) + (output_dir / AITER_API_FILENAME).write_text(api) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK fmha kernel", + ) + parser.add_argument( + "-o", + "--output_dir", + required=False, + help="write all the blobs into a directory", + ) + parser.add_argument( + "-r", + "--receipt", + default=0, + required=False, + help="codegen receipt. 
1: generate fmha v2 c++ api\n" + + " 2: generate fmha v3 c++ api\n" + + " 3: generate v2 v3 combined api for PREBUILD mode", + ) + + args = parser.parse_args() + + write_blobs(args.output_dir, int(args.receipt)) diff --git a/csrc/cpp_itfs/mha_fwd_generate.py b/csrc/cpp_itfs/mha_fwd_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..69d89e61b4cef2edd3fd3e117b51e2979f416f1a --- /dev/null +++ b/csrc/cpp_itfs/mha_fwd_generate.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: MIT + # generate kernel instances to speed up compilation + +import argparse +from pathlib import Path +from typing import Optional + +GEN_DIR = "" # in Cmake, have to generate files in same folder + +AITER_API_FILENAME = "mha_fwd.cpp" + +AITER_CPP_API = """#include "mha_fwd.h" + +namespace aiter {{ +mha_fwd_traits get_mha_fwd_traits(int head_size_q, + int head_size_v, + std::string dtype, + bool is_group_mode, + bool has_logits_soft_cap, + mask_enum mask_type, + bias_enum bias_type, + bool has_lse, + bool has_dropout, + bool use_ext_asm) +{{ + return mha_fwd_traits(head_size_q, + head_size_v, + dtype, + is_group_mode, + has_logits_soft_cap, + mask_type, + bias_type, + has_lse, + has_dropout, + use_ext_asm); +}} + +mha_fwd_splitkv_traits get_mha_fwd_splitkv_traits(int head_size_q, + int head_size_v, + std::string dtype, + bool is_group_mode, + bool has_logits_soft_cap, + mask_enum mask_type, + bias_enum bias_type, + bool has_lse) +{{ + return mha_fwd_splitkv_traits(head_size_q, + head_size_v, + dtype, + is_group_mode, + has_logits_soft_cap, + mask_type, + bias_type, + has_lse); +}} +{F_dispatch} + +}} // namespace aiter + +""" + +FMHA_FWD_API = """ +float mha_fwd(mha_fwd_args args, + const ck_tile::stream_config& stream_config, + std::string q_dtype_str, + bool is_group_mode, + mask_enum mask_type, + bias_enum bias_type, + bool has_lse, + bool use_ext_asm) +{{ + int head_size_q = args.hdim_q; + int head_size_v = args.hdim_v; + bool has_dropout = args.p_drop > 0.f; + 
auto traits = get_mha_fwd_traits(head_size_q, + head_size_v, + q_dtype_str, + is_group_mode, + args.logits_soft_cap > 0.f, + mask_type, + bias_type, + has_lse, + has_dropout, + use_ext_asm); + float t = -1; + {F_inner_dispatch} + return t; +}}""" + +FMHA_FWD_SPLITKV_API = """ +float mha_fwd_splitkv(mha_fwd_splitkv_args args, + const ck_tile::stream_config& stream_config, + std::string q_dtype_str, + bool is_group_mode, + mask_enum mask_type, + bias_enum bias_type, + bool has_lse) +{ + int head_size_q = args.hdim_q; + int head_size_v = args.hdim_v; + auto traits = get_mha_fwd_splitkv_traits(head_size_q, + head_size_v, + q_dtype_str, + is_group_mode, + args.logits_soft_cap > 0.f, + mask_type, + bias_type, + has_lse); + return fmha_fwd_splitkv(traits, args, stream_config); +}""" + +FMHA_BATCH_PREFILL_API = """ +float mha_batch_prefill(mha_batch_prefill_args args, + const ck_tile::stream_config& stream_config, + std::string q_dtype_str, + bool is_group_mode, + mask_enum mask_type, + bias_enum bias_type, + bool has_lse, + bool use_ext_asm) +{ + int head_size_q = args.hdim_q; + int head_size_v = args.hdim_v; + bool has_dropout = args.p_drop > 0.f; + auto traits = get_mha_fwd_traits(head_size_q, + head_size_v, + q_dtype_str, + is_group_mode, + args.logits_soft_cap > 0.f, + mask_type, + bias_type, + has_lse, + has_dropout, + use_ext_asm); + return fmha_batch_prefill(traits, args, stream_config); +}""" + +V2_API = """t = fmha_fwd(traits, args, stream_config);""" + +V3_API = """t = fmha_fwd_v3(traits, args, stream_config);""" + +COMBINED_API = """t = fmha_fwd_v3(traits, args, stream_config); + if (t == -1) { t = fmha_fwd(traits, args, stream_config); } +""" + +API_MAP = { + 1: FMHA_FWD_API.format(F_inner_dispatch=V3_API), + 2: FMHA_FWD_API.format(F_inner_dispatch=V2_API), + 3: FMHA_FWD_API.format(F_inner_dispatch=V2_API) + FMHA_FWD_SPLITKV_API, + 4: FMHA_BATCH_PREFILL_API, + 5: FMHA_FWD_API.format(F_inner_dispatch=COMBINED_API) + + FMHA_FWD_SPLITKV_API + + 
FMHA_BATCH_PREFILL_API, +} + + +def write_blobs(output_dir: Optional[str], receipt) -> None: + if output_dir is None: + output_dir = Path(__file__).parent + else: + output_dir = Path(output_dir) / GEN_DIR + + output_dir.mkdir(parents=True, exist_ok=True) + + api = AITER_CPP_API.format(F_dispatch=API_MAP[receipt]) + (output_dir / AITER_API_FILENAME).write_text(api) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for CK fmha kernel", + ) + parser.add_argument( + "-o", + "--output_dir", + required=False, + help="write all the blobs into a directory", + ) + parser.add_argument( + "-r", + "--receipt", + default=0, + required=False, + help="codegen receipt. 1: generate mha_fwd asm c++ api\n" + + " 2: generate mha_fwd v2(ck) c++ api\n" + + " 3: generate fmha varlen fwd c++ api\n" + + " 4: generate mha_batch_prefill c++ api\n" + + " 5: generate all fmha fwd c++ api, also can be use for PREBUILD", + ) + + args = parser.parse_args() + + write_blobs(args.output_dir, int(args.receipt)) diff --git a/csrc/include/activation.h b/csrc/include/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..1d5f4fc378e3af00aa264fba7bcbdf5bf00dd67d --- /dev/null +++ b/csrc/include/activation.h @@ -0,0 +1,13 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +namespace aiter { + +void silu_and_mul(torch::Tensor &out, torch::Tensor &input); +void scaled_silu_and_mul(torch::Tensor &out, torch::Tensor &input, torch::Tensor &scale); +void gelu_and_mul(torch::Tensor &out, torch::Tensor &input); +void gelu_tanh_and_mul(torch::Tensor &out, torch::Tensor &input); + +} // namespace aiter diff --git a/csrc/include/aiter_enum.h b/csrc/include/aiter_enum.h new file mode 100644 index 0000000000000000000000000000000000000000..5b4d877c817c4737c18644f935d0ed3cf4834205 --- /dev/null +++ b/csrc/include/aiter_enum.h @@ -0,0 +1,19 @@ +#pragma once +// SPDX-License-Identifier: MIT + + +enum class ActivationType : 
int +{ + No = -1, + Gelu = 0, + Silu = 1, +}; +enum class QuantType : int +{ + No, + per_Tensor, + per_Token, + per_1x32, + per_1x128, + per_128x128, +}; diff --git a/csrc/include/aiter_hip_common.h b/csrc/include/aiter_hip_common.h new file mode 100644 index 0000000000000000000000000000000000000000..a6d2d3543d65b6de72d576b452b83f176a839e44 --- /dev/null +++ b/csrc/include/aiter_hip_common.h @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT + +#pragma once +#include "ck_tile/core.hpp" +#include +#include + +enum class GPUArch +{ + gfx936, + gfx938, + gfx946, +}; + +#define HIP_CALL(call) \ + do \ + { \ + hipError_t err = call; \ + if(err != hipSuccess) \ + { \ + printf("\n[AITER] %s:%d fail to call %s ---> [HIP error](%s)\n", \ + __FILE__, \ + __LINE__, \ + #call, \ + hipGetErrorString(err)); \ + exit(0); \ + } \ + } while(0) + +struct p3 +{ + unsigned int _p0; + unsigned int _p1; + unsigned int _p2; +}; +struct p2 +{ + unsigned int _p0; + unsigned int _p1; +}; +struct p1 +{ + unsigned int _p0; +}; + +struct AiterAsmKernelArgs +{ + void *args_ptr; + void *arg_size_ptr; + int gdx; + int gdy; + int gdz; + int bdx; + int bdy; + int bdz; + const hipStream_t stream; +}; + +class AiterAsmKernel +{ +private: + hipModule_t module; + hipFunction_t kernel_func; + +public: + AiterAsmKernel(const char *name, const char *hsaco) + { + const char *AITER_ASM_DIR = std::getenv("AITER_ASM_DIR"); + std::cout << "[aiter] hipModuleLoad: " << (std::string(AITER_ASM_DIR) + hsaco).c_str() << " GetFunction: " << name; + HIP_CALL(hipModuleLoad(&module, (std::string(AITER_ASM_DIR) + hsaco).c_str())); + HIP_CALL(hipModuleGetFunction(&kernel_func, module, name)); + std::cout << " Success" << std::endl; + }; + + ~AiterAsmKernel() + { + HIP_CALL(hipModuleUnload(module)); + } + + void launch_kernel(const AiterAsmKernelArgs &kargs) + { + void *config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kargs.args_ptr, + HIP_LAUNCH_PARAM_BUFFER_SIZE, kargs.arg_size_ptr, + HIP_LAUNCH_PARAM_END}; + + 
HIP_CALL(hipModuleLaunchKernel(kernel_func, + kargs.gdx, kargs.gdy, kargs.gdz, + kargs.bdx, kargs.bdy, kargs.bdz, + 0, kargs.stream, nullptr, (void **)&config)); + }; +}; + +static const std::string get_gpu_arch() +{ + int device_count; + hipError_t err = hipGetDeviceCount(&device_count); + if(err != hipSuccess || device_count == 0) + { + return "No GPU Found"; + } + + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, 0); + + std::string arch_full = prop.gcnArchName; + size_t colon_pos = arch_full.find(':'); + if(colon_pos != std::string::npos) + { + return arch_full.substr(0, colon_pos); + } + else + { + return arch_full; + } +} + +static const uint32_t get_num_cu_func() +{ + auto get_num_cu_local = []() { + hipDevice_t dev; + hipDeviceProp_t dev_prop; + HIP_CALL(hipGetDevice(&dev)); + HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); + return dev_prop.multiProcessorCount; + }; + static const uint32_t num_cu = get_num_cu_local(); + return num_cu; +} diff --git a/csrc/include/aiter_operator.h b/csrc/include/aiter_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..7e79bf4387f24841539c086982d32e2fc246d181 --- /dev/null +++ b/csrc/include/aiter_operator.h @@ -0,0 +1,14 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +torch::Tensor aiter_add(torch::Tensor &input, torch::Tensor &other); +torch::Tensor aiter_mul(torch::Tensor &input, torch::Tensor &other); +torch::Tensor aiter_sub(torch::Tensor &input, torch::Tensor &other); +torch::Tensor aiter_div(torch::Tensor &input, torch::Tensor &other); + +torch::Tensor aiter_add_(torch::Tensor &input, torch::Tensor &other); +torch::Tensor aiter_mul_(torch::Tensor &input, torch::Tensor &other); +torch::Tensor aiter_sub_(torch::Tensor &input, torch::Tensor &other); +torch::Tensor aiter_div_(torch::Tensor &input, torch::Tensor &other); \ No newline at end of file diff --git a/csrc/include/aiter_unary.h b/csrc/include/aiter_unary.h new file mode 100644 index 
0000000000000000000000000000000000000000..9f42ee9e5f8ea5eb3d27f86de52a1e00e91a62be --- /dev/null +++ b/csrc/include/aiter_unary.h @@ -0,0 +1,7 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +torch::Tensor aiter_sigmoid(torch::Tensor &input); +torch::Tensor aiter_tanh(torch::Tensor &input); diff --git a/csrc/include/asm_flatmm_a8w8_blockscale.h b/csrc/include/asm_flatmm_a8w8_blockscale.h new file mode 100644 index 0000000000000000000000000000000000000000..9946aa67ae5fc7ae2070cabc8cfbdcd86f42818d --- /dev/null +++ b/csrc/include/asm_flatmm_a8w8_blockscale.h @@ -0,0 +1,12 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +torch::Tensor flatmm_a8w8_blockscale_asm( + torch::Tensor &XQ, // [M, K] + torch::Tensor &WQ, // [N, K] -> [N/128, K*128] + torch::Tensor &x_scale, // [K/128, M] + torch::Tensor &w_scale, // [K/128, N/128] + torch::Tensor &out // Out:[M, N] fp16 +); diff --git a/csrc/include/asm_gemm_a8w8.h b/csrc/include/asm_gemm_a8w8.h new file mode 100644 index 0000000000000000000000000000000000000000..b5e3f748cda40f99fff4d88f8c7064c390db2042 --- /dev/null +++ b/csrc/include/asm_gemm_a8w8.h @@ -0,0 +1,17 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +torch::Tensor gemm_a8w8_asm(torch::Tensor &A, // A:[M, K] i8 + torch::Tensor &B, // B:[N, K] i8 -> shuffle layout(32,16) + torch::Tensor &A_scale, // A_scale:[M, 1] f32 + torch::Tensor &B_scale, // B_scale:[1, N] f32 + torch::Tensor &out, // Out:[M, N] bf16 + torch::Tensor &bias, // bias:[1, N] f32 + std::optional sub_m = 128, + std::optional sub_n = 128, + std::optional pad_a = 0, + std::optional pad_b = 0, + std::optional pad_c = 0, + std::optional splitK = 0); \ No newline at end of file diff --git a/csrc/include/attention.h b/csrc/include/attention.h new file mode 100644 index 0000000000000000000000000000000000000000..b94fe6f659711e1cf4502b37ca9c9dba587ed4d4 --- /dev/null +++ b/csrc/include/attention.h @@ -0,0 +1,14 @@ +#pragma once +// SPDX-License-Identifier: MIT 
+ +#include + +void paged_attention( + torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits, + torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache, + torch::Tensor &value_cache, int64_t num_kv_heads, double scale, + torch::Tensor &block_tables, torch::Tensor &context_lens, + int64_t block_size, int64_t max_context_len, + const std::optional &alibi_slopes, + const std::string &kv_cache_dtype, double k_scale, double v_scale, + const std::optional &fp8_out_scale, int64_t partition_size); \ No newline at end of file diff --git a/csrc/include/attention_asm.h b/csrc/include/attention_asm.h new file mode 100644 index 0000000000000000000000000000000000000000..8ae2b1cdca64d10503ea12758e7f5e1a2916e834 --- /dev/null +++ b/csrc/include/attention_asm.h @@ -0,0 +1,15 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +torch::Tensor pa_fwd(torch::Tensor &Q, // [num_seqs, num_heads, head_size] + torch::Tensor &K, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + torch::Tensor &V, // [num_blocks, num_kv_heads, block_size/X, head_size, X] + torch::Tensor &block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor &context_lens, // [num_seqs] + int max_num_blocks, + std::optional &K_QScale, + std::optional &V_QScale, + std::optional &out_, + std::optional high_precision = 1); \ No newline at end of file diff --git a/csrc/include/attention_asm_mla.h b/csrc/include/attention_asm_mla.h new file mode 100644 index 0000000000000000000000000000000000000000..75b83e8c107f36f5c32b4f4670718b6ed4129312 --- /dev/null +++ b/csrc/include/attention_asm_mla.h @@ -0,0 +1,30 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +void mla_decode_stage1_asm_fwd(torch::Tensor &Q, // [num_seqs, num_heads, head_size] + torch::Tensor &KV, // [num_page, page_size, num_kv_heads, head_size] + torch::Tensor &qo_indptr, // [batch_size+1] + torch::Tensor &kv_indptr, // [batch_size+1] + torch::Tensor &kv_page_indices, // [num_page_used] 
+ torch::Tensor &kv_last_page_lens, // [batch_size] + int max_seqlen_q, + float softmax_scale, + // following are output + torch::Tensor &splitData, //[batch_size, num_kv_splits, num_heads, v_head_dim] + torch::Tensor &splitLse //[batch_size, num_kv_splits, num_heads, 1] +); + +void mla_prefill_asm_fwd(torch::Tensor &Q, // [num_seqs, num_heads, head_size] + torch::Tensor &KV, // [num_page, page_size, num_kv_heads, kv_lora_rank + qk_rope_head_dim] + torch::Tensor &qo_indptr, // [batch_size+1] + torch::Tensor &kv_indptr, // [batch_size+1] + torch::Tensor &kv_page_indices, // [num_page_used] + torch::Tensor &kv_last_page_lens, // [batch_size] + int max_seqlen_q, + float softmax_scale, + // following are output + torch::Tensor &splitData, //[batch_size, num_kv_splits, num_heads, v_head_dim] + torch::Tensor &splitLse //[batch_size, num_kv_splits, num_heads, 1] +); \ No newline at end of file diff --git a/csrc/include/attention_ck.h b/csrc/include/attention_ck.h new file mode 100644 index 0000000000000000000000000000000000000000..4eef6c70cc71cfc2e22138b0e2513f8e33e88566 --- /dev/null +++ b/csrc/include/attention_ck.h @@ -0,0 +1,24 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +torch::Tensor pa_fwd_naive(torch::Tensor &Q, // [num_seqs, num_heads, head_size] + torch::Tensor &K, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + // or[num_batch, seqlen, num_kv_heads, head_size] + torch::Tensor &V, // [num_blocks, num_kv_heads, head_size, block_size] + // or[num_batch*seqlen, num_kv_heads, head_size] + torch::Tensor &block_tables, + torch::Tensor &context_lens, + torch::Tensor &k_dequant_scales, + torch::Tensor &v_dequant_scales, + const int max_seq_len, + const int num_kv_heads, + const float scale_s, + const float scale_k, + const float scale_v, + const int block_size, + const int quant_algo, + std::optional &out_ + // above are input +); \ No newline at end of file diff --git a/csrc/include/attention_common.cuh 
b/csrc/include/attention_common.cuh new file mode 100644 index 0000000000000000000000000000000000000000..92865e385d85fc8b4dd0e3d61210d4a84dee362c --- /dev/null +++ b/csrc/include/attention_common.cuh @@ -0,0 +1,1198 @@ +// SPDX-License-Identifier: MIT + #pragma once + +#include +#include "hip_compat.h" + +#include "dtype_fp8.cuh" +#include "quant_utils.cuh" + +#include +#include + +#if defined(NDEBUG) +#undef NDEBUG +#include +#define UNREACHABLE_CODE assert(false); +#define NDEBUG +#else +#define UNREACHABLE_CODE assert(false); +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b)-1) / (b)) + + +using floatx4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; +using float16x4 = __attribute__((__vector_size__(4 * sizeof(_Float16)))) _Float16; +typedef float16x4 _Half4; +using float16x2 = __attribute__((__vector_size__(2 * sizeof(_Float16)))) _Float16; +typedef float16x2 _Half2; +typedef struct _Half8 +{ + _Half4 xy[2]; +} _Half8; + +using bit16_t = uint16_t; +using bit16x4 = __attribute__((__vector_size__(4 * sizeof(uint16_t)))) uint16_t; +typedef bit16x4 _B16x4; +typedef struct _B16x8 +{ + _B16x4 xy[2]; +} _B16x8; + +using bit16x8 = __attribute__((__vector_size__(8 * sizeof(uint16_t)))) uint16_t; +typedef bit16x8 _B16x8_2; + +using _B8x8 = uint2; +using _B8x4 = int32_t; // used in builtins +using bit8_t = uint8_t; + +typedef struct _B8x16 +{ + _B8x8 xy[2]; +} _B8x16; + +////// Non temporal loads /////// +template +__device__ __forceinline__ T loadnt(T* addr) +{ + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ _B16x8 load_ntmprl_16Byte(const _B16x8* addr) +{ + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + auto res = make_float4(dat0, dat1, dat2, dat3); + return *reinterpret_cast<_B16x8*>(&res); +} + 
+#if defined(__gfx950__) +template +__device__ __forceinline__ floatx4 gcn_mfma16x16x32_instr(const _B16x8& inpA, + const _B16x8& inpB, + const floatx4& inpC) +{ + _B16x8_2 tmpA = __builtin_shufflevector(inpA.xy[0], inpA.xy[1], 0, 1, 2, 3, 4, 5, 6, 7); + _B16x8_2 tmpB = __builtin_shufflevector(inpB.xy[0], inpB.xy[1], 0, 1, 2, 3, 4, 5, 6, 7); + + if constexpr(std::is_same::value) + { + return __builtin_amdgcn_mfma_f32_16x16x32_f16(tmpA, tmpB, inpC, absz, cbid, blgp); + } + else if constexpr(std::is_same::value) + { + return __builtin_amdgcn_mfma_f32_16x16x32_bf16(tmpA, tmpB, inpC, absz, cbid, blgp); + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} +#else +template +__device__ __forceinline__ floatx4 gcn_mfma16x16x16_instr(const _B16x4& inpA, + const _B16x4& inpB, + const floatx4& inpC) +{ + if constexpr(std::is_same::value) + { + return __builtin_amdgcn_mfma_f32_16x16x16f16(inpA, inpB, inpC, absz, cbid, blgp); + } + else if constexpr(std::is_same::value) + { + return __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(inpA, inpB, inpC, absz, cbid, blgp); + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} +#endif + +template +__device__ __forceinline__ float to_float(const T& inp) +{ + if constexpr(std::is_same::value) + { + return (float)inp; + } + else if constexpr(std::is_same::value) + { + return __bfloat162float(inp); + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ float to_float_b16(const bit16_t& inp) +{ + union tmpcvt + { + bit16_t u; + _Float16 f; + __hip_bfloat16 b; + } t16; + t16.u = inp; + if constexpr(std::is_same::value) + { + return (float)t16.f; + } + else if constexpr(std::is_same::value) + { + return __bfloat162float(t16.b); + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ T from_float(const float& inp) +{ + if constexpr(std::is_same::value) + { + return (_Float16)inp; + } + else if 
constexpr(std::is_same::value) + { + return __float2bfloat16(inp); + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) +{ + union tmpcvt + { + uint16_t u; + _Float16 f; + __hip_bfloat16 b; + } t16; + _B16x4 ret; + if constexpr(std::is_same::value) + { + union h2cvt + { + __half2 h2[2]; + _B16x4 b16x4; + } u; + u.h2[0] = __float22half2_rn(make_float2(inp[0], inp[1])); + u.h2[1] = __float22half2_rn(make_float2(inp[2], inp[3])); + return u.b16x4; + } + else if constexpr(std::is_same::value) + { + for(int i = 0; i < 4; i++) + { + union fcvt + { + uint32_t u32; + float f32; + } u; + u.f32 = inp[i]; + u.u32 += 0x7fff + ((u.u32 >> 16) & 1); // BF16 RNE with no nan/inf check + ret[i] = uint16_t(u.u32 >> 16); + } + return ret; + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, const _B16x4& inp2) +{ + union tmpcvt + { + uint16_t u; + _Float16 f; + __hip_bfloat16 b; + } t1, t2, res; + _B16x4 ret; + if constexpr(std::is_same::value) + { + union h2cvt + { + _B16x4 b16x4; + __half2 h2[2]; + } u1, u2, s; + u1.b16x4 = inp1; + u2.b16x4 = inp2; + s.h2[0] = u1.h2[0] + u2.h2[0]; + s.h2[1] = u1.h2[1] + u2.h2[1]; + return s.b16x4; + } + else if constexpr(std::is_same::value) + { + for(int i = 0; i < 4; i++) + { + union fcvt + { + float f32; + uint32_t i32; + } u1, u2, s; + u1.i32 = uint32_t(inp1[i]) << 16; + u2.i32 = uint32_t(inp2[i]) << 16; + s.f32 = u1.f32 + u2.f32; + ret[i] = uint16_t(s.i32 >> 16); + } + return ret; + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} + + +__device__ __forceinline__ floatx4 to_float_fp8x4(const _B8x4& inp) +{ +#if defined(__gfx90a__) + float4 f32x4 = + vllm::fp8::vec_conversion(*reinterpret_cast(&inp)); + return *reinterpret_cast(&f32x4); +#else // MI3xx+ optimized builtins + const auto f0 = __builtin_amdgcn_cvt_pk_f32_fp8(inp, false); + const 
auto f1 = __builtin_amdgcn_cvt_pk_f32_fp8(inp, true); + floatx4 ret; + ret[0] = f0[0]; + ret[1] = f0[1]; + ret[2] = f1[0]; + ret[3] = f1[1]; + return ret; +#endif +} + +template +__device__ __forceinline__ _B16x4 from_floatx4_rtz(const floatx4& inp) +{ + _B16x4 ret; + if constexpr(std::is_same::value) + { + union h2cvt + { + _Half2 h2[2]; + _B16x4 b16x4; + } u; + u.h2[0] = __builtin_amdgcn_cvt_pkrtz(inp[0], inp[1]); + u.h2[1] = __builtin_amdgcn_cvt_pkrtz(inp[2], inp[3]); + return u.b16x4; + } + else if constexpr(std::is_same::value) + { + for(int i = 0; i < 4; i++) + { + union fcvt + { + uint32_t i32; + float f32; + } u; + u.f32 = inp[i]; + ret[i] = uint16_t(u.i32 >> 16); + } + return ret; + } + else + { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ _B16x8 convert_b8x8_custom(const _B8x8 input) +{ + union + { + _B8x8 b8x8; + _B8x4 b8x4[2]; + } tmp; + tmp.b8x8 = input; + _B16x8 ret; + for(int i = 0; i < 2; i++) + { + ret.xy[i] = from_floatx4_rtz(to_float_fp8x4(tmp.b8x4[i])); + } + return ret; +} + +template +__device__ void _paged_attention_kernel( + const int* block_table_seq, + const int64_t query_loc, + int context_len, + const int partition_start_token_idx, + const scalar_t* q, + const cache_t* k_cache, + const cache_t* v_cache, + const float scale, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, + const int kv_block_stride, + const int kv_head_stride, + const int kv_seq_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + float logits_soft_cap, + float logits_soft_cap_rcp, + const float* k_scale_ptr, + const float* v_scale_ptr, + const AttentionVariant* variant) +{ + const int seq_idx = blockIdx.x; + const int partition_idx = blockIdx.y; + constexpr int T_PAR_SIZE = 256; + 
constexpr int NWARPS = NUM_THREADS / WARP_SIZE; + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + const int lane4id = laneid % 4; + const int lane16id = laneid % 16; + const int rowid = laneid / 16; + + const int max_num_partitions = gridDim.y; + constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4); + + __shared__ float shared_qk_max[NWARPS][16 + 1]; + __shared__ float shared_exp_sum[NWARPS][16 + 1]; + // shared_logits is used for multiple purposes + __shared__ _B16x4 shared_logits[NWARPS][4][16][4]; + + // for QK mfma16x16, layout is QHead/Tokenx16 across every 16 lanes, 16 Bytes + // HeadElements in each lane, 4x16B HeadElements across 4 rows of warp + constexpr int ROWS_PER_WARP = WARP_SIZE / 16; // rows refers to 16 lanes; refer dpp terminology + constexpr int CONTIGUOUS_KV_ELEMS_16B_LOAD = + 16 / sizeof(cache_t); // 8 for 16 bit cache type, 16 for 8 bit types + constexpr int QKHE_PER_FETCH = + CONTIGUOUS_KV_ELEMS_16B_LOAD * + ROWS_PER_WARP; // each fetch across a warp fetches these many elements + constexpr int QK_SIZE_RATIO = + sizeof(scalar_t) / sizeof(cache_t); // 1 for 16bit types, 2 for 8bit types + constexpr int QKHELOOP = HEAD_SIZE / QKHE_PER_FETCH; // 4xQKHE_16B across warp + + _B16x8 Qlocal[QKHELOOP][QK_SIZE_RATIO]; // note that 16 contiguous elements of Q should + // be fetched per lane for 8 bit cache types : + // QK_SIZE_RATIO changes for this + + constexpr int CONTIGUOUS_SCALAR_ELEMS_16B = 16 / sizeof(scalar_t); + + constexpr int TOKENS_PER_WARP = + T_PAR_SIZE / NWARPS; // sub partition of tokens per warp for qk calculation + constexpr int TLOOP = TOKENS_PER_WARP / 16; // each mfma16x16x16 instruction processes 16 tokens + + _B16x8 Klocal[TLOOP][QKHELOOP]; // can be interpreted as B8x16 for 8 bit types + + const int wg_start_head_idx = blockIdx.z * GQA_RATIO; + const int wg_start_kv_head_idx = blockIdx.z; + const int total_num_heads = gridDim.z * GQA_RATIO; + + /// NOTICE: We don't support mask for 
this kernel, so just use a placeholder type/object here. + using Mask = ck_tile::SimplifiedGenericAttentionMask; + const Mask mask{/*seqlen_q=*/1, /*seqlen_k=*/context_len}; + + const auto variant_params = [&] { + if constexpr(AttentionVariant::use_logits_soft_cap) + { + return ck_tile::LogitsSoftCapParams{ + mask, scale, logits_soft_cap, logits_soft_cap_rcp}; + } + else + { + return ck_tile::StandardAttentionParams{mask, scale}; + } + }(); + + // for QK mfma, tokens in multiples of TOKENS_PER_WARP are spread across warps + // each mfma takes QH16xT16x16HE across warp + // repeat mfmas across QKHELOOP dimension + // output layout from QKmfma : QH16xT4x4 16 qheads across 16 lanes, 16 tokens + // across 4 rows x 4 tokens per lane + + const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); + const int last_ctx_block = num_context_blocks - 1; + + int kphysical_block_number[TLOOP]; + + // fetch k physical block numbers + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kblock_idx = + (kglobal_token_idx < context_len) ? 
kglobal_token_idx / BLOCK_SIZE : last_ctx_block; + kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; + } + + // fetch Q in shared across warps and then write to registers + const int local_qhead_idx = 4 * warpid + rowid; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + + const scalar_t* q_ptr = q + query_loc * q_stride + global_qhead_idx * HEAD_SIZE; + + const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; + if((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) + { + const scalar_t* q_fetch_ptr = q_ptr + qhead_element; + const _B16x8* q_fetch_ptr_16B = reinterpret_cast(q_fetch_ptr); + _B16x8 tmp = *q_fetch_ptr_16B; + if constexpr(KV_DTYPE == vllm::Fp8KVCacheDataType::kAuto) + { + const int offset1 = + lane16id / 4; // 16 contiguous chunks of head elems are spread across 4x4lanes + shared_logits[offset1][lane4id][local_qhead_idx][0] = tmp.xy[0]; + shared_logits[offset1][lane4id][local_qhead_idx][1] = tmp.xy[1]; + } + else + { + for(int i = 0; i < 2; i++) + { + const int head_elem = lane16id * 2 + i; // element id in _B16x4 terms + const int offset3 = head_elem % 4; + const int offset2 = (head_elem / 4) % 4; + const int offset1 = head_elem / 4 / 4; + shared_logits[offset1][offset2][local_qhead_idx][offset3] = tmp.xy[i]; + } + } + } + __syncthreads(); + for(int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) + { + for(int qkratio = 0; qkratio < QK_SIZE_RATIO; qkratio++) + { + for(int i = 0; i < 2; i++) + { + Qlocal[qkhe_depth][qkratio].xy[i] = + shared_logits[qkhe_depth][rowid][lane16id % GQA_RATIO][2 * qkratio + i]; + } + } + } + + // set to true to enable non temporal kv loads: has some benefit in very high + // batch size cases + constexpr bool NT_KV_LOAD = false; + + constexpr int KX = 16 / sizeof(cache_t); // vLLM defines x as 16 Bytes of kv cache elements + const cache_t* k_ptr = k_cache + wg_start_kv_head_idx * kv_head_stride; + + const int row_head_elem = rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD; + 
// fetch K values + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + const int64_t kblock_number = static_cast(kphysical_block_number[token_depth]); + const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; + const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; + const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * kv_seq_stride; + + for(int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) + { + const int head_elem = row_head_elem + qkhe_depth * QKHE_PER_FETCH; + const int offset1 = head_elem / KX; + const int offset2 = head_elem % KX; + const cache_t* k_fetch_ptr = k_ptr3 + offset1 * KX + offset2; + const _B16x8* k_fetch_ptr_16B = reinterpret_cast(k_fetch_ptr); + if constexpr(NT_KV_LOAD) + { + Klocal[token_depth][qkhe_depth] = load_ntmprl_16Byte(k_fetch_ptr_16B); + } + else + { + Klocal[token_depth][qkhe_depth] = *k_fetch_ptr_16B; + } + } + } + + float alibi_slope; + if constexpr(ALIBI_ENABLED) + { + const int alibi_head_idx = wg_start_head_idx + lane16id; + alibi_slope = (lane16id < GQA_RATIO) ? 
alibi_slopes[alibi_head_idx] : 0.f; + } + + constexpr int n_thread_per_warp = (NWARPS * 16) / CONTIGUOUS_KV_ELEMS_16B_LOAD; // 8 + constexpr int k_thread_per_warp = WARP_SIZE / n_thread_per_warp; // 8 + constexpr int n_thread_per_block = n_thread_per_warp; // 8 + constexpr int k_thread_per_block = NWARPS * k_thread_per_warp; // 32 + constexpr int k_repeat = TOKENS_PER_WARP / k_thread_per_block; // 2 + static_assert(BLOCK_SIZE <= k_thread_per_block); + + constexpr int VTOKENS_PER_LANE = + TOKENS_PER_WARP / ROWS_PER_WARP; // 64/4 = 16 contiguous vtokens per lane + constexpr int VBLOCKS_PER_LANE = k_repeat; // assumes block size <= 32 + constexpr int VTLOOP = NWARPS; // corresponds to tokens across warps + constexpr int VTLANELOOP = + DIVIDE_ROUND_UP(VTOKENS_PER_LANE, + CONTIGUOUS_KV_ELEMS_16B_LOAD); // optimized for 16B fetches; assumes + // minimum block size is 16 + constexpr int VHELOOP = HEAD_SIZE / 16 / NWARPS; // head_size distributed across warps; each + // mfma instr works on 16 head elements + + int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE]; + + // fetch v physical block numbers + for(int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) + { + for(int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE; vblock_depth++) + { + const int vlocal_token_idx = vtoken_depth * TOKENS_PER_WARP + + vblock_depth * k_thread_per_block + + threadIdx.x / n_thread_per_block; + const int vglobal_token_idx = partition_start_token_idx + vlocal_token_idx; + const int vblock_idx = + (vglobal_token_idx < context_len) ? 
vglobal_token_idx / BLOCK_SIZE : last_ctx_block; + vphysical_block_number[vtoken_depth][vblock_depth] = block_table_seq[vblock_idx]; + } + } + + _B16x8 Vlocal[VTLOOP][VHELOOP][VTLANELOOP]; // this can be interpreted as B8x16 too + __shared__ unsigned char vlds_ptr[TOKENS_PER_WARP * n_thread_per_block * 16]; + static_assert(VBLOCKS_PER_LANE == VTLANELOOP, + "make sure we can keep un-shuffled data in Vlocal as well"); + + const cache_t* v_ptr = v_cache + wg_start_kv_head_idx * kv_head_stride + + ((threadIdx.x / n_thread_per_block) % BLOCK_SIZE) * kv_seq_stride; + + // v fetches are 16head elems across lanes x 16 tokens per lane + for(int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) + { + for(int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) + { + for(int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE; vblock_depth++) + { + const int vlds_col_idx = laneid % n_thread_per_block; + const int vhead_elem = + vhe_depth * NWARPS * 16 + vlds_col_idx * CONTIGUOUS_KV_ELEMS_16B_LOAD; + const cache_t* v_ptr2 = v_ptr + vhead_elem; + + const int64_t vblock_number = + static_cast(vphysical_block_number[vtoken_depth][vblock_depth]); + const cache_t* v_fetch_ptr = v_ptr2 + (vblock_number * kv_block_stride); + + Vlocal[vtoken_depth][vhe_depth][vblock_depth] = + *reinterpret_cast(v_fetch_ptr); + } + } + } + + // calculate post qk mfma scale + float scale2 = scale; + if constexpr(KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto) + { + // multiply by k_scale if fp8 kv cache + scale2 *= *k_scale_ptr; + } + + floatx4 dout[TLOOP]; + // qk mfma + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + dout[token_depth] = {0}; + for(int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) + { + if constexpr(KV_DTYPE == vllm::Fp8KVCacheDataType::kAuto) + { + for(int qkratio = 0; qkratio < QK_SIZE_RATIO; qkratio++) + { +#if defined(__gfx950__) + dout[token_depth] = gcn_mfma16x16x32_instr( + Klocal[token_depth][qkhe_depth], + Qlocal[qkhe_depth][qkratio], + 
dout[token_depth]); +#else + for(int i = 0; i < 2; i++) + { + dout[token_depth] = gcn_mfma16x16x16_instr( + Klocal[token_depth][qkhe_depth].xy[i], + Qlocal[qkhe_depth][qkratio].xy[i], + dout[token_depth]); + } +#endif + } + } + else + { // kv cache dtype fp8 + auto Ktmp = Klocal[token_depth][qkhe_depth]; + _B8x16 Ktmp8x16 = *reinterpret_cast<_B8x16*>(&Ktmp); + for(int qkratio = 0; qkratio < QK_SIZE_RATIO; qkratio++) + { + _B8x8 Ktmp8x8 = Ktmp8x16.xy[qkratio]; + _B16x8 Klocaltmp = convert_b8x8_custom(Ktmp8x8); +#if defined(__gfx950__) + dout[token_depth] = gcn_mfma16x16x32_instr( + Klocaltmp, + Qlocal[qkhe_depth][qkratio], + dout[token_depth]); +#else + for(int i = 0; i < 2; i++) + { + dout[token_depth] = gcn_mfma16x16x16_instr( + Klocaltmp.xy[i], Qlocal[qkhe_depth][qkratio].xy[i], dout[token_depth]); + } +#endif + } + } + } + for(int i = 0; i < 4; i++) + { + dout[token_depth][i] = variant->QueryTransform(variant_params, dout[token_depth][i]); + } + } + + const int qkout_token_idx = partition_start_token_idx + TOKENS_PER_WARP * warpid + rowid * 4; + + // apply alibi + if constexpr(ALIBI_ENABLED) + { + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + const int local_token_idx = qkout_token_idx + token_depth * 16; + const int alibi_offset = local_token_idx - context_len + 1; + for(int i = 0; i < 4; i++) + { + dout[token_depth][i] += alibi_slope * (alibi_offset + i); + } + } + } + // apply soft-capping to logits + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + for(int i = 0; i < 4; i++) + { + dout[token_depth][i] = + variant->LogitsTransform(variant_params, + dout[token_depth][i], + /*batch_idx=*/blockIdx.x, + /*qo_head_idx=*/wg_start_head_idx + lane16id, + /*kv_head_idx=*/blockIdx.z); + } + } + + // calculate qk_max and exp_sum per warp and write to shared memory + float qk_max = -FLT_MAX; + float exp_sum = 0.0f; + + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + const int local_token_idx = qkout_token_idx + 
token_depth * 16; + for(int i = 0; i < 4; i++) + { + const float tmp = (local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX; + qk_max = fmaxf(qk_max, tmp); + } + } + + for(int mask = WARP_SIZE / 2; mask >= 16; mask /= 2) + { + qk_max = fmaxf(qk_max, __shfl_xor(qk_max, mask)); + } + + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for(int i = 0; i < 4; i++) + { + const float tmp = + (local_token_idx + i < context_len) ? __expf(dout[token_depth][i] - qk_max) : 0.0f; + dout[token_depth][i] = tmp; + exp_sum += tmp; + } + } + + for(int mask = WARP_SIZE / 2; mask >= 16; mask /= 2) + { + exp_sum += __shfl_xor(exp_sum, mask); + } + + __syncthreads(); // sync before writing to shared mem + + float* shared_mem = reinterpret_cast(shared_logits); + if(laneid < 16) + { + const int qk_max_offset = warpid * 16 + lane16id; + shared_mem[qk_max_offset] = qk_max; + const int exp_sum_offset = NWARPS * 16 + qk_max_offset; + shared_mem[exp_sum_offset] = exp_sum; + } + + __syncthreads(); + + // calculate partition qk_max and exp_sum + float partition_qk_max = -FLT_MAX; + float warp_qk_max_exp[NWARPS]; + float partition_exp_sum = 0.0f; + + for(int w = 0; w < NWARPS; w++) + { + warp_qk_max_exp[w] = shared_mem[w * 16 + lane16id]; + partition_qk_max = fmaxf(partition_qk_max, warp_qk_max_exp[w]); + } + + for(int w = 0; w < NWARPS; w++) + { + warp_qk_max_exp[w] = __expf(warp_qk_max_exp[w] - partition_qk_max); + partition_exp_sum += shared_mem[NWARPS * 16 + w * 16 + lane16id] * warp_qk_max_exp[w]; + } + + const float inv_sum_scale = + __fdividef(1.f, partition_exp_sum + 1e-6f) * warp_qk_max_exp[warpid]; + + __syncthreads(); + + // write logits to shared mem + for(int token_depth = 0; token_depth < TLOOP; token_depth++) + { + dout[token_depth] *= inv_sum_scale; + // use rtz conversion for performance, with no visible impact on accuracy + shared_logits[warpid][token_depth][lane16id][rowid] = + 
from_floatx4_rtz(dout[token_depth]); + } + // write out partition max_logits and exp_sum + if(threadIdx.x < GQA_RATIO) + { + const int qhead_idx = lane16id; + const int offset = seq_idx * total_num_heads * max_num_partitions + + (wg_start_head_idx + qhead_idx) * max_num_partitions + partition_idx; + max_logits[offset] = partition_qk_max; + exp_sums[offset] = partition_exp_sum; + } + + __syncthreads(); + + constexpr int ELEMS8_ELEMS4_RATIO = 8 / 4; + constexpr int ELEMS16_ELEMS8_RATIO = 16 / 8; + + _B16x4 outelems[VHELOOP]; + // Softmax V mfma + // v layout: 16he across lanes x 16 tokens per lane + for(int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) + { + floatx4 tmp_out = {0}; + + for(int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) + { + // 1. store data into LDS + for(int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE; vblock_depth++) + { + const int vlds_col_idx = laneid % n_thread_per_block; + const int vlocal_token_idx = + vblock_depth * k_thread_per_block + threadIdx.x / n_thread_per_block; + *reinterpret_cast<_B16x8*>(vlds_ptr + + (/*row=*/vlocal_token_idx * n_thread_per_block + + /*col=*/vlds_col_idx) * + 16) = Vlocal[vtoken_depth][vhe_depth][vblock_depth]; + } + __syncthreads(); + + // 2. 
load data from LDS (transposed), then do multification + if constexpr(KV_DTYPE == vllm::Fp8KVCacheDataType::kAuto) + { + for(int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) + { + { + const int vlocal_head_elem = warpid * 16 + lane16id; + + const int vlds_col_idx = vlocal_head_elem / CONTIGUOUS_KV_ELEMS_16B_LOAD; + const int vlds_elem_idx = vlocal_head_elem % CONTIGUOUS_KV_ELEMS_16B_LOAD; + + const int vlocal_token_idx = + rowid * VTOKENS_PER_LANE + vfetch_depth * CONTIGUOUS_KV_ELEMS_16B_LOAD; + + // read data points individually and save them into array + cache_t elems[CONTIGUOUS_KV_ELEMS_16B_LOAD]; + for(int d2 = 0; d2 < CONTIGUOUS_KV_ELEMS_16B_LOAD; ++d2) + { + const cache_t* fetched_elems = reinterpret_cast( + vlds_ptr + (/*row=*/(vlocal_token_idx + d2) * n_thread_per_block + + /*col=*/vlds_col_idx) * + 16); + + elems[d2] = fetched_elems[vlds_elem_idx]; + } + + // copy all the read data points together + Vlocal[vtoken_depth][vhe_depth][vfetch_depth] = + *reinterpret_cast(elems); + } +#if defined(__gfx950__) + assert(ELEMS8_ELEMS4_RATIO == 2); + _B16x8 tmp_in; + for(int i = 0; i < 2; i++) + { + const int offset = rowid * VTLANELOOP * ELEMS8_ELEMS4_RATIO + + vfetch_depth * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = offset % ROWS_PER_WARP; + const int offset2 = offset / ROWS_PER_WARP; + tmp_in.xy[i] = shared_logits[vtoken_depth][offset2][lane16id][offset1]; + } + // output format is 16 qheads across 16 lanes, 16 head elems spread + // across 4 rows + tmp_out = gcn_mfma16x16x32_instr( + Vlocal[vtoken_depth][vhe_depth][vfetch_depth], + tmp_in, + tmp_out); +#else + for(int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) + { + const int offset = rowid * VTLANELOOP * ELEMS8_ELEMS4_RATIO + + vfetch_depth * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = offset % ROWS_PER_WARP; + const int offset2 = offset / ROWS_PER_WARP; + // output format is 16 qheads across 16 lanes, 16 head elems spread + // across 4 rows + tmp_out = gcn_mfma16x16x16_instr( + 
Vlocal[vtoken_depth][vhe_depth][vfetch_depth].xy[i], + shared_logits[vtoken_depth][offset2][lane16id][offset1], + tmp_out); + } +#endif + } + // KV cache fp8 + } + else + { + for(int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) + { + _B16x8 Vtmp = Vlocal[vtoken_depth][vhe_depth][vfetch_depth]; + // reinterpret V format as 16 elements of 8bits + _B8x16 Vtmp8x16 = *reinterpret_cast<_B8x16*>(&Vtmp); + for(int j = 0; j < ELEMS16_ELEMS8_RATIO; j++) + { + _B8x8 Vtmp8x8 = Vtmp8x16.xy[j]; + _B16x8 Vlocaltmp = convert_b8x8_custom(Vtmp8x8); +#if defined(__gfx950__) + assert(ELEMS8_ELEMS4_RATIO == 2); + _B16x8 tmp_in; + for(int i = 0; i < 2; i++) + { + const int offset = rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + + j * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = offset % ROWS_PER_WARP; + const int offset2 = offset / ROWS_PER_WARP; + tmp_in.xy[i] = shared_logits[vtoken_depth][offset2][lane16id][offset1]; + } + // output format is 16 qheads across 16 lanes, 16 head elems + // spread across 4 rows + tmp_out = gcn_mfma16x16x32_instr( + Vlocaltmp, + tmp_in, + tmp_out); +#else + for(int i = 0; i < ELEMS8_ELEMS4_RATIO; i++) + { + const int offset = rowid * ELEMS16_ELEMS8_RATIO * ELEMS8_ELEMS4_RATIO + + j * ELEMS8_ELEMS4_RATIO + i; + const int offset1 = offset % ROWS_PER_WARP; + const int offset2 = offset / ROWS_PER_WARP; + // output format is 16 qheads across 16 lanes, 16 head elems + // spread across 4 rows + tmp_out = gcn_mfma16x16x16_instr( + Vlocaltmp.xy[i], + shared_logits[vtoken_depth][offset2][lane16id][offset1], + tmp_out); + } +#endif + } + } + } + __syncthreads(); + } + // apply post Softmax V mfma v_scale + if constexpr(KV_DTYPE != vllm::Fp8KVCacheDataType::kAuto) + { + tmp_out *= *v_scale_ptr; + } + outelems[vhe_depth] = from_floatx4(tmp_out); + } + + __syncthreads(); + + // store Softmax-V mfma output to shared mem + for(int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) + { + // lane16 id head dimension; rowid head element dimension + 
shared_logits[warpid][vhe_depth][lane16id][rowid] = outelems[vhe_depth]; + } + + __syncthreads(); + + // write to tmp_out with coalesced writes after reading from shared mem + if(warpid == 0) + { + _B16x8 vout[GQA_RATIO4]; + // each lane writes out 16Bytes of tmp_out along head elem dimension + const int head_elem_idx = lane16id * 8; + if(head_elem_idx < HEAD_SIZE) + { + for(int h = 0; h < GQA_RATIO4; h++) + { + const int local_head_idx = 4 * h + rowid; + const int offset1 = (head_elem_idx / 16) % 4; + const int offset2 = head_elem_idx / 16 / NWARPS; + const int offset3 = (head_elem_idx / 4) % 4; + for(int i = 0; i < 2; i++) + { + vout[h].xy[i] = shared_logits[offset1][offset2][local_head_idx][offset3 + i]; + } + } + + const int hsz_maxp_mult = HEAD_SIZE * max_num_partitions; + scalar_t* out_ptr = + out + seq_idx * total_num_heads * hsz_maxp_mult + partition_idx * HEAD_SIZE; + for(int h = 0; h < GQA_RATIO4; h++) + { + const int local_head_idx = 4 * h + rowid; + if(local_head_idx < GQA_RATIO) + { + const int out_head_idx = wg_start_head_idx + local_head_idx; + scalar_t* out_ptr2 = out_ptr + out_head_idx * hsz_maxp_mult; + scalar_t* out_ptr3 = out_ptr2 + head_elem_idx; + _B16x8* out_ptr_B16x8 = reinterpret_cast<_B16x8*>(out_ptr3); + *out_ptr_B16x8 = vout[h]; + } + } + } + } +} + +/* Partition reduction: merges the per-partition partial outputs in tmp_out into one output vector for the (sequence, head) pair selected by blockIdx.y and blockIdx.x. Warp 0 reduces max_logits to a single maximum, rescales every partition's exp_sums entry by expf(partition_max - maximum), and publishes the per-partition weights (shared_exp_sums) and their total (shared_global_exp_sum) through shared memory. Every thread then accumulates the head element at index threadIdx.x over all partitions, divides by the total (with 1e-6f added), multiplies by the reciprocal of the value behind fp8_out_scale_ptr when that pointer is non-null, and stores the result to out at row query_loc. The template parameter list is missing from this dump; scalar_t, OUTT, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE and NPAR_LOOPS are used below as compile-time names. */ template +__device__ void _paged_attention_ll4mi_reduce_kernel( + const int64_t query_loc, + int context_len, + OUTT* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const int max_num_partitions, + const float* __restrict__ fp8_out_scale_ptr +){ + const int num_heads = gridDim.x; + const int head_idx = blockIdx.x; + const int seq_idx = blockIdx.y; + + const int num_partitions = DIVIDE_ROUND_UP(context_len, 
PARTITION_SIZE); + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + + __shared__ float shared_global_exp_sum; + // max num partitions supported is warp_size * NPAR_LOOPS + __shared__ float shared_exp_sums[NPAR_LOOPS * WARP_SIZE]; + + if(warpid == 0) + { + const float* max_logits_ptr = + max_logits + seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions; + + // valid partition is the last valid partition in case threadid > num + // partitions + int valid_partition[NPAR_LOOPS]; + float reg_max_logit[NPAR_LOOPS]; + const int last_valid_partition = num_partitions - 1; + +#pragma unroll + for(int i = 0; i < NPAR_LOOPS; i++) + { + const int partition_no = i * WARP_SIZE + threadIdx.x; + valid_partition[i] = + (partition_no < num_partitions) ? partition_no : last_valid_partition; + } +#pragma unroll + for(int i = 0; i < NPAR_LOOPS; i++) + { + reg_max_logit[i] = max_logits_ptr[valid_partition[i]]; + } + float max_logit = reg_max_logit[0]; +#pragma unroll + for(int i = 1; i < NPAR_LOOPS; i++) + { + max_logit = fmaxf(max_logit, reg_max_logit[i]); + } + +#pragma unroll + /* xor-shuffle reduction with masks from WARP_SIZE / 2 down to 1: afterwards every lane of warp 0 holds the maximum over all lanes */ for(int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) + { + max_logit = fmaxf(max_logit, __shfl_xor(max_logit, mask)); + } + + const float* exp_sums_ptr = + exp_sums + seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions; + + float rescaled_exp_sum[NPAR_LOOPS]; +#pragma unroll + for(int i = 0; i < NPAR_LOOPS; i++) + { + rescaled_exp_sum[i] = exp_sums_ptr[valid_partition[i]]; + } +#pragma unroll + for(int i = 0; i < NPAR_LOOPS; i++) + { + const int partition_no = i * WARP_SIZE + threadIdx.x; + rescaled_exp_sum[i] *= + (partition_no < num_partitions) ? 
expf(reg_max_logit[i] - max_logit) : 0.0f; + } + float global_exp_sum = rescaled_exp_sum[0]; +#pragma unroll + for(int i = 1; i < NPAR_LOOPS; i++) + { + global_exp_sum += rescaled_exp_sum[i]; + } +#pragma unroll + for(int i = 0; i < NPAR_LOOPS; i++) + { + const int partition_no = i * WARP_SIZE + threadIdx.x; + shared_exp_sums[partition_no] = rescaled_exp_sum[i]; + } + +#pragma unroll + for(int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) + { + global_exp_sum += __shfl_xor(global_exp_sum, mask); + } + if(threadIdx.x == 0) + { + shared_global_exp_sum = global_exp_sum; + } + } // warpid == 0 + /* each thread reads the head element at index threadIdx.x of this (seq, head) slice. NOTE(review): as far as this dump shows, the offset is built from int operands, so a large product of seq_idx, num_heads, max_num_partitions and HEAD_SIZE could overflow - confirm the supported ranges */ const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + threadIdx.x; + constexpr int MAX_NPAR = 64; + scalar_t tmps[MAX_NPAR]; + const float dzero = 0.0f; +#pragma unroll + for(int j = 0; j < MAX_NPAR; j++) + { + tmps[j] = from_float(dzero); + } + const int last_partition_offset = (num_partitions - 1) * HEAD_SIZE; + const int num_partition_offset = (num_partitions)*HEAD_SIZE; + int idx = 0; + + constexpr int JCHUNK = 16; + +#pragma unroll + for(int j = 0; j < JCHUNK * HEAD_SIZE; j += HEAD_SIZE) + { + // lastj is last valid partition + const int lastj_offset = (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + /* barrier placed after the first JCHUNK loads: warp 0's writes to shared_exp_sums and shared_global_exp_sum precede it, and every read of them follows it */ __syncthreads(); + + if(num_partitions > JCHUNK) + { +#pragma unroll + for(int j = JCHUNK * HEAD_SIZE; j < 2 * JCHUNK * HEAD_SIZE; j += HEAD_SIZE) + { + const int lastj_offset = (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + if(num_partitions > 2 * JCHUNK) + { +#pragma unroll + for(int j = 2 * JCHUNK * HEAD_SIZE; j < MAX_NPAR * HEAD_SIZE; j += HEAD_SIZE) + { + const int lastj_offset = (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + } + } // num_partitions > JCHUNK + + // Aggregate tmp_out to out. 
+ /* loads past the last partition were clamped to the last valid one; warp 0 stored a weight of 0 for those slots, so the duplicates add nothing to acc */ float acc = 0.0f; +#pragma unroll + for(int j = 0; j < JCHUNK; j++) + { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if(num_partitions > JCHUNK) + { +#pragma unroll + for(int j = JCHUNK; j < 2 * JCHUNK; j++) + { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if(num_partitions > 2 * JCHUNK) + { +#pragma unroll + for(int j = 2 * JCHUNK; j < MAX_NPAR; j++) + { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + } + } + + /* partitions beyond the first MAX_NPAR are handled in further batches of MAX_NPAR, reusing tmps. NOTE(review): the shared_exp_sums index used below stays in bounds only if WARP_SIZE is at least MAX_NPAR - confirm */ for(int p = 1; p < NPAR_LOOPS; p++) + { + if(num_partitions > p * MAX_NPAR) + { + idx = 0; +#pragma unroll + for(int j = p * MAX_NPAR * HEAD_SIZE; j < (p + 1) * MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) + { + // lastj is last valid partition + const int lastj_offset = (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + +#pragma unroll + for(int j = 0; j < MAX_NPAR; j++) + { + acc += to_float(tmps[j]) * shared_exp_sums[j + p * MAX_NPAR]; + } + } + } + + const float inv_global_exp_sum = __fdividef(1.0f, shared_global_exp_sum + 1e-6f); + const float out_scale = (fp8_out_scale_ptr != nullptr) ? 
1.0f / (*fp8_out_scale_ptr) : 1.0f; + acc *= inv_global_exp_sum; + acc *= out_scale; + /* the output row is selected by query_loc (int64_t), not by seq_idx */ OUTT* out_ptr = out + query_loc * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; + if constexpr(std::is_same::value) + { + out_ptr[threadIdx.x] = hip_fp8(acc).data; + } + else + { + out_ptr[threadIdx.x] = from_float(acc); + } +} diff --git a/csrc/include/attention_dtypes.h b/csrc/include/attention_dtypes.h new file mode 100644 index 0000000000000000000000000000000000000000..c800b3d773d1068a0c8c017b0d0a102a4de190eb --- /dev/null +++ b/csrc/include/attention_dtypes.h @@ -0,0 +1,8 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include "attention_generic.cuh" +#include "dtype_float16.cuh" +#include "dtype_float32.cuh" +#include "dtype_bfloat16.cuh" +#include "dtype_fp8.cuh" \ No newline at end of file diff --git a/csrc/include/attention_generic.cuh b/csrc/include/attention_generic.cuh new file mode 100644 index 0000000000000000000000000000000000000000..191474ae43c865714e24699cfabae1da011738fb --- /dev/null +++ b/csrc/include/attention_generic.cuh @@ -0,0 +1,64 @@ +/* + + * Adapted from + * Copyright (c) 2023, The vLLM team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace vllm { + +// A vector type to store Q, K, V elements. +template +struct Vec {}; + +// A vector type to store FP32 accumulators. +template +struct FloatVec {}; + +// Template vector operations. 
+template +inline __device__ Acc mul(A a, B b); + +template +inline __device__ float sum(T v); + +/* dot(a, b): returns sum() applied to the product mul(a, b). Two overloads with identical visible bodies follow; they presumably differ only in their template parameter lists, which are missing from this dump. mul and sum are only declared in the visible part of this header. */ template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +/* zero(dst): clears dst by writing 0u into every 32-bit word of a union that overlays T with uint32_t[WORDS], then assigning the union's T member back to dst. WORDS is sizeof(T) / 4 with integer division, so this presumably expects sizeof(T) to be a multiple of 4 - TODO confirm. */ template +inline __device__ void zero(T& dst) { + constexpr int WORDS = sizeof(T) / 4; + union { + T raw; + uint32_t words[WORDS]; + } tmp; + +#pragma unroll + for (int ii = 0; ii < WORDS; ++ii) { + tmp.words[ii] = 0u; + } + dst = tmp.raw; +} + +} // namespace vllm diff --git a/csrc/include/attention_ragged.h b/csrc/include/attention_ragged.h new file mode 100644 index 0000000000000000000000000000000000000000..8f27117553acda4bd10ec7b18fe3b4a6c71879bc --- /dev/null +++ b/csrc/include/attention_ragged.h @@ -0,0 +1,27 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +void paged_attention_ragged( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& workspace_buffer, + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& key_cache, // [num_blocks, num_heads, block_size, head_size] or + // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& value_cache, // [num_blocks, num_heads, block_size, head_size] or + // [num_blocks, block_size, num_heads, head_size] + double scale, + torch::Tensor& kv_indptr, // [num_seqs + 1] + torch::Tensor& kv_page_indices, // [max_num_blocks] + std::optional& kv_last_page_lens, // [num_seqs] + int64_t block_size, + int64_t max_num_partitions, + const std::optional& alibi_slopes, + const std::string& kv_cache_dtype, + const std::string& kv_cache_layout, + float logits_soft_cap, + torch::Tensor& k_scale, + torch::Tensor& v_scale, + const std::optional& fp8_out_scale, + int64_t partition_size); diff --git a/csrc/include/awq_dq_asm.h b/csrc/include/awq_dq_asm.h new file mode 100644 index 0000000000000000000000000000000000000000..620a4261b50ca5da2d6552c3495d847764001ca8 --- /dev/null +++ b/csrc/include/awq_dq_asm.h @@ -0,0 
+1,13 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include +#include "aiter_enum.h" + + +void awq_dq_asm(torch::Tensor &out, + torch::Tensor &mat1, + std::optional &zero, + std::optional &scalar +); + diff --git a/csrc/include/awq_gemm_asm.h b/csrc/include/awq_gemm_asm.h new file mode 100644 index 0000000000000000000000000000000000000000..13e8e82466ec0ad06f74e93892c3712d9a3ef719 --- /dev/null +++ b/csrc/include/awq_gemm_asm.h @@ -0,0 +1,23 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include +#include "aiter_enum.h" + + +void awq_gemm_asm(torch::Tensor &out, + torch::Tensor &mat1, + torch::Tensor &mat2, + std::optional &zero, + std::optional &scalar +); + +void awq_gemm_asm_tuning(torch::Tensor &out, + torch::Tensor &mat1, + torch::Tensor &mat2, + std::optional &zero, + std::optional &scalar, + int solutionid, std::string& jsonfile +); + + diff --git a/csrc/include/binary_operator.cuh b/csrc/include/binary_operator.cuh new file mode 100644 index 0000000000000000000000000000000000000000..77de4a96caf06e70b56d919a42f0f380109cb185 --- /dev/null +++ b/csrc/include/binary_operator.cuh @@ -0,0 +1,1926 @@ +/* + * Copyright (C) 2024-2025, The vLLM team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include "hip_compat.h" +#include "dispatch_utils.h" +#include + +#include +#include +typedef __hip_bfloat16 nv_bfloat16; + +namespace aiter +{ + template + inline __device__ T performOperation(T a, T b); + + template + torch::Tensor aten_compute(torch::Tensor &input, torch::Tensor &other); + + /* Operation tags for the element-wise kernels. Each tag provides a device-side static apply(a, b) and a host-side static compute(input, other) that forwards to the torch function of the same arithmetic (torch::add, torch::sub, torch::mul, torch::div). */ struct AddOp + { + template + inline __device__ static T apply(T a, T b) { return a + b; } + + static torch::Tensor compute(torch::Tensor &input, torch::Tensor &other) + { + return torch::add(input, other); + } + }; + + struct SubOp + { + template + inline __device__ static T apply(T a, T b) + { + return a - b; + } + + static torch::Tensor compute(torch::Tensor &input, torch::Tensor &other) + { + return torch::sub(input, other); + } + }; + + struct MulOp + { + template + inline __device__ static T apply(T a, T b) { return a * b; } + + static torch::Tensor compute(torch::Tensor &input, torch::Tensor &other) + { + return torch::mul(input, other); + } + }; + + struct DivOp + { + template + inline __device__ static T apply(T a, T b) + { + // assert(b == static_cast(0)); -- disabled; NOTE(review): as written this would require b == 0, so a zero-divisor guard would need the opposite condition + return a / b; + } + + static torch::Tensor compute(torch::Tensor &input, torch::Tensor &other) + { + return torch::div(input, other); + } + }; + + /* Applies Operation::apply to (a, b), selected at compile time. The first and third branches always call apply(a, b); the second and fourth call apply(b, a) when order_flag is false and apply(a, b) otherwise. The type arguments of the std::is_same_v tests are missing from this dump, so which branch belongs to which tag cannot be confirmed here (presumably the order-sensitive SubOp and DivOp take the swapping branches). NOTE(review): static_assert(false, ...) inside the final else is rejected at template definition time by compilers that do not implement CWG 2518 - confirm the toolchain. */ template + inline __device__ T performOperation(T a, T b) + { + if constexpr (std::is_same_v) + { + return Operation::apply(a, b); + } + else if constexpr (std::is_same_v) + { + if constexpr (!order_flag) + { + return Operation::apply(b, a); + } + else + { + return Operation::apply(a, b); + } + } + else if constexpr (std::is_same_v) + { + return Operation::apply(a, b); + } + else if constexpr (std::is_same_v) + { + if constexpr (!order_flag) + { + return Operation::apply(b, a); + } + else + { + return Operation::apply(a, b); + } + } + else + { + static_assert(false, "Unsupported operation"); + } + } + /* Host-side counterpart of performOperation: every supported branch returns Operation::compute(input, other); any other Operation reaches the static_assert. */ template + torch::Tensor aten_compute(torch::Tensor &input, torch::Tensor &other) + { + if constexpr (std::is_same_v) 
+ { + return Operation::compute(input, other); + } + else if constexpr (std::is_same_v) + { + return Operation::compute(input, other); + } + else if constexpr (std::is_same_v) + { + return Operation::compute(input, other); + } + else if constexpr (std::is_same_v) + { + return Operation::compute(input, other); + } + else + { + static_assert(false, "Unsupported operation"); + } + } + + /* Tiled element-wise binary kernel. blockIdx.x is decoded into an m index (current_m) and an (n-tile, k-tile) pair (ti, tj); the m index is swizzled in groups of M_SWIZZLE when enough m tiles exist. Each block copies one BIG_TILE_SIZE_N x BIG_TILE_SIZE_K tile of a into LDS (sa), addressing a with stride0 per m index and stride2 per row, synchronises, then reads the tile back transposed (sa[col][row]) and combines it with b through performOperation, writing to c. b and c are addressed with a row stride of N and a per-m stride of N times K. Tiles whose valid extents pass the divisibility test use 16-byte stores to c (and 16-byte loads from b when types_match is true); all other tiles use one element per access. When types_match is false both operands pass through float locals before the result is cast to _T. The template parameter list is missing from this dump; _T, _T0, _T1, Operation, order_flag, _WG, BIG_TILE_SIZE_N, BIG_TILE_SIZE_K and M_SWIZZLE are used below as compile-time names. */ template + __global__ void operator_tn_big_tile_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int N, const int K, int stride0, int stride2, bool types_match) + { + // pad LDS row by dword + constexpr uint32_t LDS_PAD = 4 / sizeof(_T); + constexpr uint32_t element_size = sizeof(_T); // in bytes + constexpr uint32_t elements_in_16B = 16 / element_size; + + union BLOCK_16B + { + _T e[elements_in_16B]; + __uint128_t ow; + }; + + // Round up processing to next full tile + const uint32_t n_tiles = (N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N; + const uint32_t k_tiles = (K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K; + const uint32_t nk_tiles = n_tiles * k_tiles; + const uint32_t m_tiles = gridDim.x / nk_tiles; + const uint32_t m_tile_swizzle = blockIdx.x / nk_tiles / M_SWIZZLE * M_SWIZZLE; + /// do m_swizzle when there are enough m_tiles + const bool swizzle_m = m_tile_swizzle + M_SWIZZLE <= m_tiles; + const uint32_t current_m = swizzle_m ? m_tile_swizzle + blockIdx.x % M_SWIZZLE : blockIdx.x / nk_tiles; + + /* NOTE(review): N and K are int parameters, so the product below is evaluated in int before widening to uint64_t - confirm large shapes cannot overflow */ const uint64_t stride_k = N; + const uint64_t out_stride_nk = N * K; + + const uint32_t current_nk = swizzle_m ? blockIdx.x / M_SWIZZLE % nk_tiles : blockIdx.x % nk_tiles; + const uint32_t ti = current_nk / k_tiles; + const uint32_t tj = current_nk % k_tiles; + + __shared__ _T0 sa[BIG_TILE_SIZE_N][BIG_TILE_SIZE_K + LDS_PAD]; + + const uint32_t current_n_size = (ti == (n_tiles - 1) && (N % BIG_TILE_SIZE_N) != 0) ? (N % BIG_TILE_SIZE_N) : BIG_TILE_SIZE_N; + const uint32_t current_k_size = (tj == (k_tiles - 1) && (K % BIG_TILE_SIZE_K) != 0) ? 
(K % BIG_TILE_SIZE_K) : BIG_TILE_SIZE_K; + // use 128bit load&store whenever possible; NOTE(review): the k test uses a literal 8 rather than elements_in_16B - confirm this is intended + if (current_n_size % elements_in_16B == 0 && current_k_size % 8 == 0) + { + // Load phase: each thread copies elements_in_16B consecutive elements of a into LDS per step; positions outside the valid tile re-read offset 0. row_bytes is used as an element offset, not a byte count. + constexpr uint32_t row_bytes = BIG_TILE_SIZE_K; + constexpr uint32_t ld_per_row = row_bytes / elements_in_16B; + constexpr uint32_t rows_per_wg = _WG / ld_per_row; + constexpr uint32_t vmem_per_thread = BIG_TILE_SIZE_N / rows_per_wg; + // Make sure WG isn't too large + static_assert(vmem_per_thread >= 1); + + const _T0 *pat = (const _T0 *)a + tj * row_bytes + ti * BIG_TILE_SIZE_N * stride2 + current_m * stride0; +#pragma unroll + for (uint32_t t = 0; t < vmem_per_thread; t++) + { + uint32_t col = threadIdx.x % ld_per_row; + uint32_t row = threadIdx.x / ld_per_row + t * rows_per_wg; + uint64_t offset = (col * elements_in_16B < current_k_size && row < current_n_size) ? row * stride2 + col * elements_in_16B : 0; + const _T0 *pfa = (const _T0 *)(pat + offset); + // BLOCK_16B d; + // d.ow = *pfa; +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + sa[row][col * elements_in_16B + i] = pfa[i]; + } + } + __syncthreads(); + // Write-back phase: combine b with the transposed LDS tile and store to c in 16-byte chunks + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = row_bytes_wr / elements_in_16B; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + // Make sure WG isn't too large + static_assert(wr_per_row >= 1); + + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + _T *pc = (_T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + /* NOTE(review): this loop is bounded by vmem_per_thread while wr_per_row is only asserted above; the two evaluate to the same value when the tile sizes divide evenly - confirm */ for (uint32_t t = 0; t < vmem_per_thread; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col * elements_in_16B < current_n_size && row < current_k_size) + { + uint64_t 
offset = row * stride_k + col * elements_in_16B; + BLOCK_16B d; + if (types_match) + { + const __uint128_t *pfb = (const __uint128_t *)(pb + offset); + d.ow = *pfb; +// Transpose tile on read from LDS +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + d.e[i] = performOperation<_T, Operation, order_flag>(static_cast<_T>(sa[col * elements_in_16B + i][row]), d.e[i]); + } + __uint128_t *pfc = (__uint128_t *)(pc + offset); + *pfc = d.ow; + } + else + { + const _T1 *pfb = (const _T1 *)(pb + offset); +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + /* the locals a, b and c declared here shadow the kernel's pointer parameters inside this loop body */ float a = static_cast(sa[col * elements_in_16B + i][row]); + float b = static_cast(pfb[i]); + float c = performOperation(a, b); + d.e[i] = static_cast<_T>(c); + } + __uint128_t *pfc = (__uint128_t *)(pc + offset); + *pfc = d.ow; + } + } + } + } + else + { + // Copy partial tiles with element accesses + constexpr uint32_t row_bytes = BIG_TILE_SIZE_K; + constexpr uint32_t ld_per_row = BIG_TILE_SIZE_K; + constexpr uint32_t rows_per_wg = _WG / ld_per_row; + constexpr uint32_t vmem_per_thread = BIG_TILE_SIZE_N / rows_per_wg; + // Make sure WG isn't too large + static_assert(vmem_per_thread >= 1); + + const _T0 *pat = (const _T0 *)a + ti * BIG_TILE_SIZE_N * stride2 + tj * row_bytes + current_m * stride0; +#pragma unroll + for (uint32_t t = 0; t < vmem_per_thread; t++) + { + uint32_t col = threadIdx.x % ld_per_row; + uint32_t row = threadIdx.x / ld_per_row + t * rows_per_wg; + uint64_t offset = (col < current_k_size && row < current_n_size) ? 
row * stride2 + col : 0; + const _T0 *pfa = (const _T0 *)(pat + offset); + sa[row][col] = *pfa; + } + __syncthreads(); + + // Write-back phase for the partial tile: one element per access, read transposed from LDS + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = BIG_TILE_SIZE_N; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + _T *pc = (_T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + for (uint32_t t = 0; t < wr_per_row; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col < current_n_size && row < current_k_size) + { + uint64_t offset = row * stride_k + col; + const _T1 *pfb = (const _T1 *)(pb + offset); + _T *pfc = (_T *)(pc + offset); + if (types_match) + { + *pfc = performOperation<_T, Operation, order_flag>(static_cast<_T>(sa[col][row]), static_cast<_T>(*pfb)); + } + else + { + float a = static_cast(sa[col][row]); + float b = static_cast(*pfb); + float c = performOperation(a, b); + *pfc = static_cast<_T>(c); + } + } + } + } + } + + template + __global__ void operator_bcast_big_tile_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int N, const int K, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); // in bytes + constexpr uint32_t elements_in_16B = 16 / element_size; + + union BLOCK_16B + { + _T e[elements_in_16B]; + __uint128_t ow; + }; + + // Round up processing to next full tile + const uint32_t n_tiles = (N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N; + const uint32_t k_tiles = (K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K; + const uint32_t nk_tiles = n_tiles * k_tiles; + const uint32_t m_tiles = gridDim.x / nk_tiles; + const uint32_t m_tile_swizzle = blockIdx.x / nk_tiles / 
M_SWIZZLE * M_SWIZZLE; + /// do m_swizzle when there are enough m_tiles + const bool swizzle_m = m_tile_swizzle + M_SWIZZLE <= m_tiles; + const uint32_t current_m = swizzle_m ? m_tile_swizzle + blockIdx.x % M_SWIZZLE : blockIdx.x / nk_tiles; + + const uint64_t stride_k = N; + const uint64_t out_stride_nk = N * K; + + const uint32_t current_nk = swizzle_m ? blockIdx.x / M_SWIZZLE % nk_tiles : blockIdx.x % nk_tiles; + const uint32_t ti = current_nk / k_tiles; + const uint32_t tj = current_nk % k_tiles; + + const uint32_t current_n_size = (ti == (n_tiles - 1) && (N % BIG_TILE_SIZE_N) != 0) ? (N % BIG_TILE_SIZE_N) : BIG_TILE_SIZE_N; + const uint32_t current_k_size = (tj == (k_tiles - 1) && (K % BIG_TILE_SIZE_K) != 0) ? (K % BIG_TILE_SIZE_K) : BIG_TILE_SIZE_K; + + // use 128bit load&store whenever possible + if (current_n_size % 8 == 0 && current_k_size % elements_in_16B == 0) + { + // Copy full tile with large loads + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = row_bytes_wr / elements_in_16B; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + // Make sure WG isn't too large + static_assert(wr_per_row >= 1); + + const _T0 *pa = (const _T0 *)a + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr; + const _T *pc = (const _T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + for (uint32_t t = 0; t < wr_per_row; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col * elements_in_16B < current_n_size && row < current_k_size) + { + BLOCK_16B d, f; + uint64_t offset = row * stride_k + col * elements_in_16B; + if (types_match) + { + const __uint128_t *pfa = (const __uint128_t *)(pa + offset); + const 
__uint128_t *pfb = (const __uint128_t *)(pb + offset); + f.ow = *pfa; + d.ow = *pfb; +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + d.e[i] = performOperation<_T, Operation, order_flag>(static_cast<_T>(f.e[i]), static_cast<_T>(d.e[i])); + } + __uint128_t *pfc = (__uint128_t *)(pc + offset); + *pfc = d.ow; + } + else + { + const _T0 *pfa = (const _T0 *)(pa + offset); + const _T1 *pfb = (const _T1 *)(pb + offset); +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + float a = static_cast(pfa[i]); + float b = static_cast(pfb[i]); + float c = performOperation(a, b); + d.e[i] = static_cast<_T>(c); + } + __uint128_t *pfc = (__uint128_t *)(pc + offset); + *pfc = d.ow; + } + } + } + } + else + { + // Copy full tile with large loads + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = BIG_TILE_SIZE_N; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + const _T0 *pa = (const _T0 *)a + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr; + const _T *pc = (const _T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + for (uint32_t t = 0; t < wr_per_row; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col < current_n_size && row < current_k_size) + { + uint64_t offset = row * stride_k + col; + const _T0 *pfa = (const _T0 *)(pa + offset); + const _T1 *pfb = (const _T1 *)(pb + offset); + _T *pfc = (_T *)(pc + offset); + if (types_match) + { + *pfc = performOperation<_T, Operation, order_flag>(static_cast<_T>(*pfa), static_cast<_T>(*pfb)); + } + else + { + float a = static_cast(*pfa); + float b = static_cast(*pfb); + float c = performOperation(a, b); + *pfc = 
static_cast<_T>(c); + } + } + } + } + } + + template + __global__ void operator_bcast1_big_tile_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int N, const int K, bool types_match) + { + // pad LDS row by dword + constexpr uint32_t element_size = sizeof(_T); // in bytes + constexpr uint32_t elements_in_16B = 16 / element_size; + + union BLOCK_16B + { + _T e[elements_in_16B]; + __uint128_t ow; + }; + + // Round up processing to next full tile + const uint32_t n_tiles = (N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N; + const uint32_t k_tiles = (K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K; + const uint32_t nk_tiles = n_tiles * k_tiles; + const uint32_t m_tiles = gridDim.x / nk_tiles; + const uint32_t m_tile_swizzle = blockIdx.x / nk_tiles / M_SWIZZLE * M_SWIZZLE; + /// do m_swizzle when there are enough m_tiles + const bool swizzle_m = m_tile_swizzle + M_SWIZZLE <= m_tiles; + const uint32_t current_m = swizzle_m ? m_tile_swizzle + blockIdx.x % M_SWIZZLE : blockIdx.x / nk_tiles; + + const uint64_t stride_k = N; + const uint64_t out_stride_nk = N * K; + + const uint32_t current_nk = swizzle_m ? blockIdx.x / M_SWIZZLE % nk_tiles : blockIdx.x % nk_tiles; + const uint32_t ti = current_nk / k_tiles; + const uint32_t tj = current_nk % k_tiles; + + const uint32_t current_n_size = (ti == (n_tiles - 1) && (N % BIG_TILE_SIZE_N) != 0) ? (N % BIG_TILE_SIZE_N) : BIG_TILE_SIZE_N; + const uint32_t current_k_size = (tj == (k_tiles - 1) && (K % BIG_TILE_SIZE_K) != 0) ? 
(K % BIG_TILE_SIZE_K) : BIG_TILE_SIZE_K; + + // use 128bit load&store whenever possible + if (current_n_size % 8 == 0 && current_k_size % elements_in_16B == 0) + { + // Copy full tile with large loads + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = row_bytes_wr / elements_in_16B; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + // Make sure WG isn't too large + static_assert(wr_per_row >= 1); + + const _T0 *pa = (const _T0 *)a + ti * row_bytes_wr + current_m * stride_k; + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T *pc = (const _T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + for (uint32_t t = 0; t < wr_per_row; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col * elements_in_16B < current_n_size && row < current_k_size) + { + uint64_t offset_a = col * elements_in_16B; + uint64_t offset = row * stride_k + col * elements_in_16B; + BLOCK_16B d, f; + if (types_match) + { + const __uint128_t *pfa = (const __uint128_t *)(pa + offset_a); + const __uint128_t *pfb = (const __uint128_t *)(pb + offset); + f.ow = *pfa; + d.ow = *pfb; +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + d.e[i] = performOperation<_T, Operation, order_flag>(static_cast<_T>(f.e[i]), static_cast<_T>(d.e[i])); + } + __uint128_t *pfc = (__uint128_t *)(pc + offset); + *pfc = d.ow; + } + else + { + const _T0 *pfa = (const _T0 *)(pa + offset_a); + const _T1 *pfb = (const _T1 *)(pb + offset); +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + float a = static_cast(pfa[i]); + float b = static_cast(pfb[i]); + float c = performOperation(a, b); + d.e[i] = static_cast<_T>(c); + } + __uint128_t *pfc = (__uint128_t *)(pc + 
offset); + *pfc = d.ow; + } + } + } + } + else + { + // Copy full tile with large loads + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = BIG_TILE_SIZE_N; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + const _T0 *pa = (const _T0 *)a + ti * row_bytes_wr + current_m * stride_k; + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T *pc = (const _T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + for (uint32_t t = 0; t < wr_per_row; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col < current_n_size && row < current_k_size) + { + uint64_t offset_a = col; + uint64_t offset = row * stride_k + col; + const _T0 *pfa = (const _T0 *)(pa + offset_a); + const _T1 *pfb = (const _T1 *)(pb + offset); + _T *pfc = (_T *)(pc + offset); + if (types_match) + { + *pfc = performOperation<_T, Operation, order_flag>(static_cast<_T>(*pfa), static_cast<_T>(*pfb)); + } + else + { + float a = static_cast(*pfa); + float b = static_cast(*pfb); + float c = performOperation(a, b); + *pfc = static_cast<_T>(c); + } + } + } + } + } + + template + __global__ void operator_bcast_tile_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int M, const int N, const int K, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); // in bytes + constexpr uint32_t elements_in_16B = 16 / element_size; + uint64_t idx = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + uint32_t n_tiles = N / _rows; + uint32_t k_tiles = K / elements_in_16B; + if (idx < (uint64_t)M * n_tiles * k_tiles) + { + uint32_t ti = idx / (k_tiles * n_tiles); + uint64_t idx_block = idx % (k_tiles * n_tiles); + uint32_t tj = (idx_block / k_tiles) % n_tiles; + 
uint32_t tk = idx_block % k_tiles; + for (int row = 0; row < _rows; row++) + { + uint64_t offset_b = (uint64_t)(tj + row * n_tiles) * K + tk * elements_in_16B; + uint64_t offset_ac = (uint64_t)(tj + row * n_tiles) * K + tk * elements_in_16B + (uint64_t)ti * N * K; + const _T0 *pa = reinterpret_cast(a) + offset_ac; + const _T1 *pb = reinterpret_cast(b) + offset_b; + _T *pc = reinterpret_cast<_T *>(c) + offset_ac; + for (int col = 0; col < elements_in_16B; col++) + { + const _T0 *pfa = pa + col; + const _T1 *pfb = pb + col; + _T *pfc = pc + col; + if (types_match) + { + *pfc = performOperation<_T, Operation, order_flag>(static_cast<_T>(*pfa), static_cast<_T>(*pfb)); + } + else + { + float t0 = static_cast(*pfa); + float t1 = static_cast(*pfb); + float t2 = performOperation(t0, t1); + *pfc = static_cast<_T>(t2); + } + } + } + } + } + + // (m, n, k), (k,) + template + __global__ void operator_bcastK_unroll_vectorize_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int m, const int n, const int k, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = (uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index + (uint64_t)((_rows - 1) * blockDim.x * vec_size) < (m * n * k); index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for (int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; + uint64_t other_start = (index + block_offset) % k; +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t input_offset = index + block_offset + vec_index; + uint64_t other_offset = other_start + vec_index; + if (types_match) + { + _T t0 = 
static_cast<_T>(a_ptr[input_offset]); + _T t1 = static_cast<_T>(b_ptr[other_offset]); + *(c_ptr + input_offset) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[input_offset]); + float t1 = static_cast(b_ptr[other_offset]); + float t2 = performOperation(t0, t1); + *(c_ptr + input_offset) = static_cast<_T>(t2); + } + } + } + } + } + + // (m, n, k), (1) + template + __global__ void operator_bcast_scalar_unroll_vectorize_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, const int n) + { + constexpr uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = (uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1 b_val = *reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + _T b_T = static_cast<_T>(b_val); + for (uint64_t index = idx; index + (uint64_t)((_rows - 1) * blockDim.x * vec_size) < n; index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for (int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t input_offset = index + block_offset + vec_index; + _T t0 = static_cast<_T>(a_ptr[input_offset]); + *(c_ptr + input_offset) = performOperation<_T, Operation, order_flag>(t0, b_T); + } + } + } + } + + // (m, n, k), (m, 1, k) + template + __global__ void operator_bcastM1K_unroll_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int m, const int n, const int k, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = (uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = 
reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index + (uint64_t)((_rows - 1) * blockDim.x * vec_size) < (m * n * k); index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for (int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; + uint64_t other_m_index = (index + block_offset) / (n * k); + uint64_t other_k_index = (index + block_offset) % k; +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t input_offset = index + block_offset + vec_index; + uint64_t other_offset = other_m_index * k + other_k_index + vec_index; + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[input_offset]); + _T t1 = static_cast<_T>(b_ptr[other_offset]); + *(c_ptr + input_offset) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[input_offset]); + float t1 = static_cast(b_ptr[other_offset]); + float t2 = performOperation(t0, t1); + *(c_ptr + input_offset) = static_cast<_T>(t2); + } + } + } + } + } + + // (m, n, k), (m, n, 1) + template + __global__ void operator_bcastMN1_unroll_vec_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int forward_dim, const int bcast_dim, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = (uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index + (uint64_t)((_rows - 1) * blockDim.x * vec_size) < (forward_dim * bcast_dim); index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for (int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; + 
uint64_t other_offset = (index + block_offset) / bcast_dim; +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t input_offset = index + block_offset + vec_index; + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[input_offset]); + _T t1 = static_cast<_T>(b_ptr[other_offset]); + *(c_ptr + input_offset) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[input_offset]); + float t1 = static_cast(b_ptr[other_offset]); + float t2 = performOperation(t0, t1); + *(c_ptr + input_offset) = static_cast<_T>(t2); + } + } + } + } + } + + template + __global__ void operator_bcastMN1_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int forward_dim, const int bcast_dim, bool types_match) + { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index < forward_dim * bcast_dim; index += blockDim.x * gridDim.x) + { + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[index]); + _T t1 = static_cast<_T>(b_ptr[index / bcast_dim]); + *(c_ptr + index) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[index]); + float t1 = static_cast(b_ptr[index / bcast_dim]); + float t2 = performOperation(t0, t1); + *(c_ptr + index) = static_cast<_T>(t2); + } + } + } + + template + __global__ void operator_bcast1N1_unroll_vec_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int m, const int n, const int k, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = (uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for 
(uint64_t index = idx; index + (uint64_t)((_rows - 1) * blockDim.x * vec_size) < (m * n * k); index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for (int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; + uint64_t other_offset = (index + block_offset) % (n * k) / k; +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t input_offset = index + block_offset + vec_index; + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[input_offset]); + _T t1 = static_cast<_T>(b_ptr[other_offset]); + *(c_ptr + input_offset) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[input_offset]); + float t1 = static_cast(b_ptr[other_offset]); + float t2 = performOperation(t0, t1); + *(c_ptr + input_offset) = static_cast<_T>(t2); + } + } + } + } + } + + template + __global__ void operator_bcast1N1_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int m, const int n, const int k, bool types_match) + { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index < m * n * k; index += blockDim.x * gridDim.x) + { + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[index]); + _T t1 = static_cast<_T>(b_ptr[index % (n * k) / k]); + *(c_ptr + index) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[index]); + float t1 = static_cast(b_ptr[index % (n * k) / k]); + float t2 = performOperation(t0, t1); + *(c_ptr + index) = static_cast<_T>(t2); + } + } + } + + template + __global__ void operator_bcastN11_unroll_vec_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int m, const int n, const int k, bool types_match) + { + constexpr 
uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = (uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index + (uint64_t)((_rows - 1) * blockDim.x * vec_size) < (m * n * k); index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for (int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; + // uint64_t other_offset = (index + block_offset) / (n * k); +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t input_offset = index + block_offset + vec_index; + uint64_t other_offset = input_offset / (n * k); + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[input_offset]); + _T t1 = static_cast<_T>(b_ptr[other_offset]); + *(c_ptr + input_offset) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[input_offset]); + float t1 = static_cast(b_ptr[other_offset]); + float t2 = performOperation(t0, t1); + *(c_ptr + input_offset) = static_cast<_T>(t2); + } + } + } + } + } + + template + __global__ void operator_bcastN11_unroll_vec_pad(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int m, const int n, const int k, const int padded_size, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = (uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index + (uint64_t)((_rows - 1) * blockDim.x * vec_size) < padded_size; index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for 
(int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; + // uint64_t other_offset = (index + block_offset) / (n * k); + if (index + block_offset < (m * n * k)) + { +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t input_offset = index + block_offset + vec_index; + uint64_t other_offset = input_offset / (n * k); + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[input_offset]); + _T t1 = static_cast<_T>(b_ptr[other_offset]); + *(c_ptr + input_offset) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[input_offset]); + float t1 = static_cast(b_ptr[other_offset]); + float t2 = performOperation(t0, t1); + *(c_ptr + input_offset) = static_cast<_T>(t2); + } + } + } + } + } + } + + + template + __global__ void operator_bcastN11_naive(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int m, const int n, const int k, bool types_match) + { + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index < m * n * k; index += blockDim.x * gridDim.x) + { + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[index]); + _T t1 = static_cast<_T>(b_ptr[index / (n * k)]); + *(c_ptr + index) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[index]); + float t1 = static_cast(b_ptr[index / (n * k)]); + float t2 = performOperation(t0, t1); + *(c_ptr + index) = static_cast<_T>(t2); + } + } + } + + template + __global__ void operator_contiguous_kernel_naive(const void* __restrict a, const void* __restrict b, void* __restrict c, + const int n, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); + constexpr uint32_t vec_size = 16 / element_size; + uint64_t idx = 
(uint64_t)(blockIdx.x * blockDim.x * _rows + threadIdx.x) * vec_size; + const _T0* a_ptr = reinterpret_cast(a); + const _T1* b_ptr = reinterpret_cast(b); + _T* c_ptr = reinterpret_cast<_T*>(c); + for (uint64_t index = idx; index + (uint64_t)(_rows - 1) * blockDim.x * vec_size < n; index += gridDim.x * blockDim.x * _rows * vec_size) + { +#pragma unroll + for (int unroll_index = 0; unroll_index < _rows; ++unroll_index) + { + uint64_t block_offset = (uint64_t)blockDim.x * vec_size * unroll_index; +#pragma unroll + for (int vec_index = 0; vec_index < vec_size; ++vec_index) + { + uint64_t offset = index + block_offset + vec_index; + if (types_match) + { + _T t0 = static_cast<_T>(a_ptr[offset]); + _T t1 = static_cast<_T>(b_ptr[offset]); + *(c_ptr + offset) = performOperation<_T, Operation, order_flag>(t0, t1); + } + else + { + float t0 = static_cast(a_ptr[offset]); + float t1 = static_cast(b_ptr[offset]); + float t2 = performOperation(t0, t1); + *(c_ptr + offset) = static_cast<_T>(t2); + } + } + } + } + } + + template + __global__ void operator_contiguous_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int M, const int N, const int K, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); // in bytes + constexpr uint32_t elements_in_16B = 16 / element_size; + uint64_t idx = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x; + uint32_t n_tiles = N / _rows; + uint32_t k_tiles = K / elements_in_16B; + if (idx < (uint64_t)M * n_tiles * k_tiles) + { + uint32_t ti = idx / (k_tiles * n_tiles); + uint64_t idx_block = idx % (k_tiles * n_tiles); + uint32_t tj = (idx_block / k_tiles) % n_tiles; + uint32_t tk = idx_block % k_tiles; + for (int row = 0; row < _rows; row++) + { + uint64_t offset = (uint64_t)(tj + row * n_tiles) * K + tk * elements_in_16B + (uint64_t)ti * N * K; + const _T0 *pa = reinterpret_cast(a) + offset; + const _T1 *pb = reinterpret_cast(b) + offset; + _T *pc = reinterpret_cast<_T *>(c) + offset; + for (int col = 
0; col < elements_in_16B; col++) + { + const _T0 *pfa = pa + col; + const _T1 *pfb = pb + col; + _T *pfc = pc + col; + if (types_match) + { + *pfc = performOperation<_T, Operation, order_flag>(static_cast<_T>(*pfa), static_cast<_T>(*pfb)); + } + else + { + float t0 = static_cast(*pfa); + float t1 = static_cast(*pfb); + float t2 = performOperation(t0, t1); + *pfc = static_cast<_T>(t2); + } + } + } + } + } + + template + __global__ void operator_element_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int size, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); // in bytes + constexpr uint32_t elements_in_16B = 16 / element_size; + uint64_t idx = ((uint64_t)blockIdx.x * blockDim.x + threadIdx.x); + if (idx * elements_in_16B < size) + { + int offset = idx * elements_in_16B; + const _T0 *pa = reinterpret_cast(a) + offset; + const _T1 *pb = reinterpret_cast(b) + offset; + _T *pc = reinterpret_cast<_T *>(c) + offset; +#pragma unroll + for (uint32_t v = 0; v < elements_in_16B; v++) + { + if (types_match) + { + pc[v] = performOperation<_T, Operation, true>(static_cast<_T>(pa[v]), static_cast<_T>(pb[v])); + } + else + { + float t0 = static_cast(pa[v]); + float t1 = static_cast(pb[v]); + float t2 = performOperation(t0, t1); + pc[v] = static_cast<_T>(t2); + } + } + } + } + + template + __global__ void operator_contiguous_big_tile_kernel(const void *__restrict a, const void *__restrict b, void *__restrict c, + const int N, const int K, bool types_match) + { + constexpr uint32_t element_size = sizeof(_T); // in bytes + constexpr uint32_t elements_in_16B = 16 / element_size; + + union BLOCK_16B + { + _T e[elements_in_16B]; + __uint128_t ow; + }; + + // Round up processing to next full tile + const uint32_t n_tiles = (N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N; + const uint32_t k_tiles = (K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K; + const uint32_t nk_tiles = n_tiles * k_tiles; + const uint32_t m_tiles = gridDim.x / 
nk_tiles; + const uint32_t m_tile_swizzle = blockIdx.x / nk_tiles / M_SWIZZLE * M_SWIZZLE; + /// do m_swizzle when there are enough m_tiles + const bool swizzle_m = m_tile_swizzle + M_SWIZZLE <= m_tiles; + const uint32_t current_m = swizzle_m ? m_tile_swizzle + blockIdx.x % M_SWIZZLE : blockIdx.x / nk_tiles; + + const uint64_t stride_k = N; + const uint64_t out_stride_nk = N * K; + + const uint32_t current_nk = swizzle_m ? blockIdx.x / M_SWIZZLE % nk_tiles : blockIdx.x % nk_tiles; + const uint32_t ti = current_nk / k_tiles; + const uint32_t tj = current_nk % k_tiles; + + const uint32_t current_n_size = (ti == (n_tiles - 1) && (N % BIG_TILE_SIZE_N) != 0) ? (N % BIG_TILE_SIZE_N) : BIG_TILE_SIZE_N; + const uint32_t current_k_size = (tj == (k_tiles - 1) && (K % BIG_TILE_SIZE_K) != 0) ? (K % BIG_TILE_SIZE_K) : BIG_TILE_SIZE_K; + + // use 128bit load&store whenever possible + if (current_n_size % 8 == 0 && current_k_size % elements_in_16B == 0) + { + // Copy full tile with large loads + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = row_bytes_wr / elements_in_16B; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + // Make sure WG isn't too large + static_assert(wr_per_row >= 1); + + const _T0 *pa = (const _T0 *)a + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T *pc = (const _T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + for (uint32_t t = 0; t < wr_per_row; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col * elements_in_16B < current_n_size && row < current_k_size) + { + BLOCK_16B d, f; + uint64_t offset = row * stride_k + col * elements_in_16B; + 
if (types_match) + { + const __uint128_t *pfa = (const __uint128_t *)(pa + offset); + const __uint128_t *pfb = (const __uint128_t *)(pb + offset); + f.ow = *pfa; + d.ow = *pfb; +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + d.e[i] = performOperation<_T, Operation, order_flag>(static_cast<_T>(f.e[i]), static_cast<_T>(d.e[i])); + } + __uint128_t *pfc = (__uint128_t *)(pc + offset); + *pfc = d.ow; + } + else + { + const _T0 *pfa = (const _T0 *)(pa + offset); + const _T1 *pfb = (const _T1 *)(pb + offset); +#pragma unroll + for (uint32_t i = 0; i < elements_in_16B; i++) + { + float a = static_cast(pfa[i]); + float b = static_cast(pfb[i]); + float c = performOperation(a, b); + d.e[i] = static_cast<_T>(c); + } + __uint128_t *pfc = (__uint128_t *)(pc + offset); + *pfc = d.ow; + } + } + } + } + else + { + // Copy full tile with large loads + constexpr uint32_t row_bytes_wr = BIG_TILE_SIZE_N; + constexpr uint32_t vmem_per_row_wr = BIG_TILE_SIZE_N; + constexpr uint32_t rows_per_wg_wr = _WG / vmem_per_row_wr; + constexpr uint32_t wr_per_row = BIG_TILE_SIZE_K / rows_per_wg_wr; + const _T0 *pa = (const _T0 *)a + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T1 *pb = (const _T1 *)b + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; + const _T *pc = (const _T *)c + tj * BIG_TILE_SIZE_K * stride_k + ti * row_bytes_wr + current_m * out_stride_nk; +#pragma unroll + for (uint32_t t = 0; t < wr_per_row; t++) + { + uint32_t col = threadIdx.x % vmem_per_row_wr; + uint32_t row = threadIdx.x / vmem_per_row_wr + t * rows_per_wg_wr; + if (col < current_n_size && row < current_k_size) + { + uint64_t offset = row * stride_k + col; + const _T0 *pfa = (const _T0 *)(pa + offset); + const _T1 *pfb = (const _T1 *)(pb + offset); + _T *pfc = (_T *)(pc + offset); + if (types_match) + { + *pfc = performOperation<_T, Operation, order_flag>(static_cast<_T>(*pfa), static_cast<_T>(*pfb)); + } + else + { + 
float a = static_cast(*pfa); + float b = static_cast(*pfb); + float c = performOperation(a, b); + *pfc = static_cast<_T>(c); + } + } + } + } + } +} // namespace aiter + +__inline__ std::vector broadcastShapes(const torch::Tensor &tensor1, const torch::Tensor &tensor2) +{ + auto shape1 = tensor1.sizes().vec(); + auto shape2 = tensor2.sizes().vec(); + + int64_t max_dim = std::max(shape1.size(), shape2.size()); + shape1.insert(shape1.begin(), max_dim - shape1.size(), 1); + shape2.insert(shape2.begin(), max_dim - shape2.size(), 1); + + std::vector result_shape(max_dim, 1); + for (int64_t i = 0; i < max_dim; ++i) + { + if (shape1[i] == 1) + { + result_shape[i] = shape2[i]; + } + else if (shape2[i] == 1) + { + result_shape[i] = shape1[i]; + } + else if (shape1[i] == shape2[i]) + { + result_shape[i] = shape1[i]; + } + else + { + throw std::invalid_argument("Incompatible shapes for binary operator."); + } + } + + return result_shape; +} + +template +struct BinaryOperationPattern; + +// PATTERN_TRANSPOSE +template +struct BinaryOperationPattern<1, Operation, _T0, _T1> +{ + static void apply(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) + { + int dim = input.dim(); + auto shape = output.sizes().vec(); + void *buf_a = reinterpret_cast(input.data_ptr()); + void *buf_b = reinterpret_cast(other.data_ptr()); + void *buf_c = reinterpret_cast(output.data_ptr()); + + int num_elements = output.numel(); + int rem_dim_size = num_elements / (shape[dim - 2] * shape[dim - 1]); + int M = dim == 2 ? 1 : rem_dim_size; + int N = shape[dim - 2]; + int K = shape[dim - 1]; + + auto tensor_not_conti = input.is_contiguous() ? 
other : input; + int stride0 = tensor_not_conti.stride(0); + int stride2 = tensor_not_conti.stride(2); + constexpr uint32_t BIG_TILE_SIZE_N = 64; + constexpr uint32_t BIG_TILE_SIZE_K = 64; + constexpr uint32_t M_SWIZZLE = 8; + const int grid_x = M * ((N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N) * ((K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K); + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(256, 1, 1); + const hipStream_t stream = at::hip::getCurrentHIPStream(); + bool types_match = typeid(_T0) == typeid(_T1); + + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_tn_big_tile_kernel", [&] + { aiter::operator_tn_big_tile_kernel + <<>>(buf_a, buf_b, buf_c, K, N, stride0, stride2, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_tn_big_tile_kernel", [&] + { aiter::operator_tn_big_tile_kernel + <<>>(buf_b, buf_a, buf_c, K, N, stride0, stride2, types_match); }); + } + } +}; + +// PATTERN_BROADCAST_0 +template +struct BinaryOperationPattern<2, Operation, _T0, _T1> +{ + static void apply(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) + { + int dim = input.dim(); + auto shape = output.sizes().vec(); + + void *buf_a = reinterpret_cast(input.data_ptr()); + void *buf_b = reinterpret_cast(other.data_ptr()); + void *buf_c = reinterpret_cast(output.data_ptr()); + int num_elements = output.numel(); + int rem_dim_size = num_elements / (shape[dim - 2] * shape[dim - 1]); + int M = dim != 3 ? 
1 : rem_dim_size; + int N = shape[dim - 2]; + int K = shape[dim - 1]; + if (dim == 4) + { + N = shape[0] * shape[1] * shape[2]; + K = shape[3]; + } + const hipStream_t stream = at::hip::getCurrentHIPStream(); + bool types_match = typeid(_T0) == typeid(_T1); + + const uint32_t rows = 8; + int vec = 16 / output.element_size(); + + hipDevice_t dev; + hipDeviceProp_t dev_prop; + hipGetDevice(&dev); + hipGetDeviceProperties(&dev_prop, dev); + uint32_t num_cu = dev_prop.multiProcessorCount; + + bool bcast_k_dim = shape.size() == 3 && ((input.dim() == 1 && input.size(0) == shape[2]) || (other.dim() == 1 && other.size(0) == shape[2])); // shape[2] is only read for a 3-D output; lower-rank outputs have no such element + bool bcast_scalar = (input.dim() == 1 && input.size(0) == 1) || (other.dim() == 1 && other.size(0) == 1); + bool vec_unroll_able = shape.size() == 3 && num_elements % (rows * vec * 256) == 0 && shape[2] % vec == 0; // same guard: only the 3-D branch below consumes this flag + + // (m,n,k), (k,) + if (bcast_k_dim && vec_unroll_able && output.dim() == 3) + { + int grid_x = (num_elements / (rows * vec) + 256 - 1) / 256; + int occupancy; + auto kernel_ptr = aiter::operator_bcastK_unroll_vectorize_naive<_T0, rows, Operation, true, _T0, _T1>; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, reinterpret_cast(kernel_ptr), 256, 0); + grid_x = grid_x < num_cu * occupancy ? 
grid_x : num_cu * occupancy; + const dim3 block_dim(256, 1, 1); + const dim3 grid_dim(grid_x, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastK_unroll_vectorize_naive", [&] + { aiter::operator_bcastK_unroll_vectorize_naive + <<>>(buf_a, buf_b, buf_c, M, N, K, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastK_unroll_vectorize_naive", [&] + { aiter::operator_bcastK_unroll_vectorize_naive + <<>>(buf_b, buf_a, buf_c, shape[0], shape[1], shape[2], types_match); }); + } + } + // (m, n, k), (1) + else if (bcast_scalar && num_elements % (rows * vec * 256) == 0) + { + int grid_x = (num_elements / (rows * vec) + 256 - 1) / 256; + int occupancy; + auto kernel_ptr = aiter::operator_bcast_scalar_unroll_vectorize_naive<_T0, rows, Operation, true, _T0, _T1>; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, reinterpret_cast(kernel_ptr), 256, 0); + grid_x = grid_x < num_cu * occupancy ? grid_x : num_cu * occupancy; + const dim3 block_dim(256, 1, 1); + const dim3 grid_dim(grid_x, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast_scalar_unroll_vectorize_naive", [&] + { aiter::operator_bcast_scalar_unroll_vectorize_naive + <<>>(buf_a, buf_b, buf_c, num_elements); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast_scalar_unroll_vectorize_naive", [&] + { aiter::operator_bcast_scalar_unroll_vectorize_naive + <<>>(buf_b, buf_a, buf_c, num_elements); }); + } + } + else if (N % rows == 0 && K % vec == 0) + { + constexpr uint32_t wg = 64; + int grid_x = (num_elements / (rows * vec) + wg - 1) / wg; + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(wg, 1, 1); + + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast_tile_kernel", [&] + { aiter::operator_bcast_tile_kernel + <<>>(buf_a, buf_b, buf_c, M, N, K, types_match); }); + } + else + { + 
VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast_tile_kernel", [&] + { aiter::operator_bcast_tile_kernel + <<>>(buf_b, buf_a, buf_c, M, N, K, types_match); }); + } + } + else + { + constexpr uint32_t BIG_TILE_SIZE_N = 64; + constexpr uint32_t BIG_TILE_SIZE_K = 64; + constexpr uint32_t M_SWIZZLE = 8; + const int grid_x = M * ((N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N) * ((K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K); + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(256, 1, 1); + + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast_big_tile_kernel", [&] + { aiter::operator_bcast_big_tile_kernel + <<>>(buf_a, buf_b, buf_c, K, N, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast_big_tile_kernel", [&] + { aiter::operator_bcast_big_tile_kernel + <<>>(buf_b, buf_a, buf_c, K, N, types_match); }); + } + } + } +}; + +// PATTERN_BROADCAST_1 +template +struct BinaryOperationPattern<3, Operation, _T0, _T1> +{ + static void apply(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) + { + int dim = input.dim(); + auto shape = output.sizes().vec(); + void *buf_a = reinterpret_cast(input.data_ptr()); + void *buf_b = reinterpret_cast(other.data_ptr()); + void *buf_c = reinterpret_cast(output.data_ptr()); + + int num_elements = output.numel(); + int rem_dim_size = num_elements / (shape[dim - 2] * shape[dim - 1]); + int M = dim == 2 ? 
1 : rem_dim_size; + int N = shape[dim - 2]; + int K = shape[dim - 1]; + + // (m, n, p, q), (m, 1, p, q): dim 1 differs between the operands and exactly one of them has size 1 there + if (dim == 4 && input.size(1) != other.size(1) && (input.size(1) == 1 || other.size(1) == 1)) + { + M = shape[0]; + N = shape[1]; + K = shape[2] * shape[3]; + } + + constexpr uint32_t BIG_TILE_SIZE_N = 64; + constexpr uint32_t BIG_TILE_SIZE_K = 64; + constexpr uint32_t M_SWIZZLE = 8; + int grid_x = M * ((N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N) * ((K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K); + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(256, 1, 1); + const hipStream_t stream = at::hip::getCurrentHIPStream(); + bool types_match = typeid(_T0) == typeid(_T1); + constexpr int rows = 8; + int vec_size = 16 / output.element_size(); + if (K % vec_size == 0 && num_elements % (256 * rows * vec_size) == 0) + { + hipDevice_t dev; + hipDeviceProp_t dev_prop; + hipGetDevice(&dev); + hipGetDeviceProperties(&dev_prop, dev); + uint32_t num_cu = dev_prop.multiProcessorCount; + grid_x = (num_elements / (rows * vec_size) + 256 - 1) / 256; + int occupancy; + auto kernel_ptr = aiter::operator_bcastM1K_unroll_kernel<_T0, rows, Operation, true, _T0, _T1>; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, reinterpret_cast(kernel_ptr), 256, 0); + grid_x = grid_x < num_cu * occupancy ? 
grid_x : num_cu * occupancy; + + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(256, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastM1K_unroll_kernel", [&] + { aiter::operator_bcastM1K_unroll_kernel + <<>>(buf_a, buf_b, buf_c, M, N, K, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastM1K_unroll_kernel", [&] + { aiter::operator_bcastM1K_unroll_kernel + <<>>(buf_b, buf_a, buf_c, M, N, K, types_match); }); + } + } + else + { + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast1_big_tile_kernel", [&] + { aiter::operator_bcast1_big_tile_kernel + <<>>(buf_a, buf_b, buf_c, K, N, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast1_big_tile_kernel", [&] + { aiter::operator_bcast1_big_tile_kernel + <<>>(buf_b, buf_a, buf_c, K, N, types_match); }); + } + } + } +}; + +// PATTERN_BROADCAST_2 +template +struct BinaryOperationPattern<5, Operation, _T0, _T1> +{ + static void apply(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) + { + int dim = input.dim(); + auto shape = output.sizes().vec(); + void *buf_a = reinterpret_cast(input.data_ptr()); + void *buf_b = reinterpret_cast(other.data_ptr()); + void *buf_c = reinterpret_cast(output.data_ptr()); + + int bcast_dim = order_flag ? input.numel() / other.numel() : other.numel() / input.numel(); + int forward_dim = order_flag ? 
other.numel() : input.numel(); + + int num_elements = output.numel(); + int vec_size = 16 / output.element_size(); + constexpr uint32_t row = 8; + const hipStream_t stream = at::hip::getCurrentHIPStream(); + bool types_match = typeid(_T0) == typeid(_T1); + + // optimize kernel + if (bcast_dim % vec_size == 0 && forward_dim % row == 0 && num_elements % (256 * vec_size * row) == 0) + { + hipDevice_t dev; + hipDeviceProp_t dev_prop; + hipGetDevice(&dev); + hipGetDeviceProperties(&dev_prop, dev); + uint32_t num_cu = dev_prop.multiProcessorCount; + + constexpr uint32_t wg = 256; + int grid_x = (num_elements / (row * vec_size) + wg - 1) / wg; + int occupancy; + auto kernel_ptr = aiter::operator_bcastMN1_unroll_vec_naive<_T0, row, Operation, true, _T0, _T1>; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, reinterpret_cast(kernel_ptr), wg, 0); + grid_x = grid_x < num_cu * occupancy ? grid_x : num_cu * occupancy; + + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(wg, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastMN1_unroll_vec_naive", [&] + { aiter::operator_bcastMN1_unroll_vec_naive + <<>>(buf_a, buf_b, buf_c, forward_dim, bcast_dim, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastMN1_unroll_vec_naive", [&] + { aiter::operator_bcastMN1_unroll_vec_naive + <<>>(buf_b, buf_a, buf_c, forward_dim, bcast_dim, types_match); }); + } + } + // fallback + else + { + const dim3 block_dim(256, 1, 1); + const dim3 grid_dim((num_elements + 256 - 1) / 256, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastMN1_naive", [&] + { aiter::operator_bcastMN1_naive + <<>>(buf_a, buf_b, buf_c, forward_dim, bcast_dim, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastMN1_naive", [&] + { aiter::operator_bcastMN1_naive + <<>>(buf_b, buf_a, buf_c, forward_dim, bcast_dim, 
types_match); }); + } + } + } +}; + +// PATTERN_BROADCAST_3 +template +struct BinaryOperationPattern<6, Operation, _T0, _T1> +{ + static void apply(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) + { + int dim = output.dim(); + auto shape = output.sizes().vec(); + void *buf_a = reinterpret_cast(input.data_ptr()); + void *buf_b = reinterpret_cast(other.data_ptr()); + void *buf_c = reinterpret_cast(output.data_ptr()); + + int m = output.size(0); + int n = output.size(1); + int k = output.size(2); + + int num_elements = output.numel(); + int vec_size = 16 / output.element_size(); + constexpr uint32_t row = 8; + const hipStream_t stream = at::hip::getCurrentHIPStream(); + bool types_match = typeid(_T0) == typeid(_T1); + + // optimize kernel + if (k % vec_size == 0 && num_elements % (256 * vec_size * row) == 0) + { + hipDevice_t dev; + hipDeviceProp_t dev_prop; + hipGetDevice(&dev); + hipGetDeviceProperties(&dev_prop, dev); + uint32_t num_cu = dev_prop.multiProcessorCount; + + constexpr uint32_t wg = 256; + int grid_x = (num_elements / (row * vec_size) + wg - 1) / wg; + int occupancy; + auto kernel_ptr = aiter::operator_bcast1N1_unroll_vec_naive<_T0, row, Operation, true, _T0, _T1>; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, reinterpret_cast(kernel_ptr), wg, 0); + grid_x = grid_x < num_cu * occupancy ? 
grid_x : num_cu * occupancy; + + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(wg, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast1N1_unroll_vec_naive", [&] + { aiter::operator_bcast1N1_unroll_vec_naive + <<>>(buf_a, buf_b, buf_c, m, n, k, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast1N1_unroll_vec_naive", [&] + { aiter::operator_bcast1N1_unroll_vec_naive + <<>>(buf_b, buf_a, buf_c, m, n, k, types_match); }); + } + } + // fallback + else + { + const dim3 block_dim(256, 1, 1); + const dim3 grid_dim((num_elements + 256 - 1) / 256, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast1N1_naive", [&] + { aiter::operator_bcast1N1_naive + <<>>(buf_a, buf_b, buf_c, m, n, k, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcast1N1_naive", [&] + { aiter::operator_bcast1N1_naive + <<>>(buf_b, buf_a, buf_c, m, n, k, types_match); }); + } + } + } +}; + +// PATTERN_BROADCAST_4 +// broadcast last 2 dim, (m, n, k) (m, 1, 1) +template +struct BinaryOperationPattern<7, Operation, _T0, _T1> +{ + static void apply(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) + { + int dim = output.dim(); + auto shape = output.sizes().vec(); + void *buf_a = reinterpret_cast(input.data_ptr()); + void *buf_b = reinterpret_cast(other.data_ptr()); + void *buf_c = reinterpret_cast(output.data_ptr()); + + int m = output.size(0); + int n = output.size(1); + int k = output.size(2); + + int num_elements = output.numel(); + int vec_size = 16 / output.element_size(); + constexpr uint32_t row = 8; + const hipStream_t stream = at::hip::getCurrentHIPStream(); + bool types_match = typeid(_T0) == typeid(_T1); + + // optimize kernel + // if (k % vec_size == 0 && num_elements % (256 * vec_size * row) == 0) + if (num_elements % vec_size == 0) + { + hipDevice_t 
dev; + hipDeviceProp_t dev_prop; + hipGetDevice(&dev); + hipGetDeviceProperties(&dev_prop, dev); + uint32_t num_cu = dev_prop.multiProcessorCount; + + constexpr uint32_t wg = 256; + int tmp_row;// = row; + int grid_x = (num_elements / (row * vec_size) + wg - 1) / wg; + int occupancy; + bool need_pad = true; + int padded_size = num_elements; + + auto ifNeedPad = [=] (int tmp_row) + { + return num_elements % (wg * tmp_row * vec_size) != 0; + }; + + auto getPaddedSize = [=] (int tmp_row) + { + int elem_num_per_block = wg * vec_size * tmp_row; + return ((num_elements + elem_num_per_block - 1) / elem_num_per_block) * elem_num_per_block; + }; + +#define GET_PATTERN(_row) \ + do \ + { \ + tmp_row = _row; \ + grid_x = (num_elements / (tmp_row * vec_size) + wg - 1) / wg; \ + auto kernel_ptr = aiter::operator_bcast1N1_unroll_vec_naive<_T0, _row, Operation, true, _T0, _T1>; \ + hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, reinterpret_cast(kernel_ptr), wg, 0); \ + need_pad = ifNeedPad(tmp_row); \ + if (need_pad) \ + { \ + padded_size = getPaddedSize(tmp_row); \ + } \ + } while(0) + + if (grid_x > 512 && num_elements % (8 * vec_size) == 0) // row = 8 + { + GET_PATTERN(8); + } + else if (grid_x > 256 && num_elements % (4 * vec_size) == 0) // row = 4 + { + GET_PATTERN(4); + } + else if (grid_x > 128 && num_elements % (2 * vec_size) == 0) // row = 2 + { + GET_PATTERN(2); + } + else // row = 1 + { + GET_PATTERN(1); + } + grid_x = grid_x < num_cu * occupancy ? 
grid_x : num_cu * occupancy; + +#define BCAST_CASE(case_row, normal_tensor, bcast_tensor, order_flag) \ + do \ + { \ + case case_row: \ + { \ + if (!need_pad) \ + { \ + VLLM_DISPATCH_FLOATING_TYPES( \ + output.scalar_type(), "operator_bcastN11_unroll_vec_naive", [&] \ + { aiter::operator_bcastN11_unroll_vec_naive \ + <<>>(normal_tensor, bcast_tensor, buf_c, m, n, k, types_match); }); \ + } \ + else \ + { \ + VLLM_DISPATCH_FLOATING_TYPES( \ + output.scalar_type(), "operator_bcastN11_unroll_vec_pad", [&] \ + { aiter::operator_bcastN11_unroll_vec_pad \ + <<>>(normal_tensor, bcast_tensor, buf_c, m, n, k, padded_size, types_match); }); \ + } \ + return; \ + } \ + } while(0) + + const dim3 grid_dim(grid_x); + const dim3 block_dim(wg); + if (order_flag) + { + switch (tmp_row) + { + BCAST_CASE(8, buf_a, buf_b, true); + BCAST_CASE(4, buf_a, buf_b, true); + BCAST_CASE(2, buf_a, buf_b, true); + BCAST_CASE(1, buf_a, buf_b, true); + } + } + else + { + switch(tmp_row) + { + BCAST_CASE(8, buf_b, buf_a, false); + BCAST_CASE(4, buf_b, buf_a, false); + BCAST_CASE(2, buf_b, buf_a, false); + BCAST_CASE(1, buf_b, buf_a, false); + } + } + } + // fallback + else + { + const dim3 block_dim(256, 1, 1); + const dim3 grid_dim((num_elements + 256 - 1) / 256, 1, 1); + if (order_flag) + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastN11_naive", [&] + { aiter::operator_bcastN11_naive + <<>>(buf_a, buf_b, buf_c, m, n, k, types_match); }); + } + else + { + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_bcastN11_naive", [&] + { aiter::operator_bcastN11_naive + <<>>(buf_b, buf_a, buf_c, m, n, k, types_match); }); + } + } + } +}; + +// PATTERN_CONTIGUOUS +template +struct BinaryOperationPattern<4, Operation, _T0, _T1> +{ + static void apply(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) + { + int dim = input.dim(); + auto shape = output.sizes().vec(); + + const uint32_t rows = 8; + void *buf_a = 
reinterpret_cast(input.data_ptr()); + void *buf_b = reinterpret_cast(other.data_ptr()); + void *buf_c = reinterpret_cast(output.data_ptr()); + int num_elements = output.numel(); + int rem_dim_size = 1; + int M, N, K; + if (dim == 1) + { + M = 1; + N = input.numel() / 128; + K = 128; + } + else + { + for (int i = 0; i < dim - 2; ++i) + { + rem_dim_size *= shape[i]; + } + M = dim == 3 ? shape[0] : rem_dim_size; + N = shape[dim - 2]; + K = shape[dim - 1]; + if (N < rows) + { + K = N * K; + N = M; + M = 1; + } + } + + const hipStream_t stream = at::hip::getCurrentHIPStream(); + bool types_match = typeid(_T0) == typeid(_T1); + int vec = 16 / output.element_size(); + hipDevice_t dev; + hipDeviceProp_t dev_prop; + hipGetDevice(&dev); + hipGetDeviceProperties(&dev_prop, dev); + uint32_t num_cu = dev_prop.multiProcessorCount; + + if (num_elements % vec == 0 && num_elements < num_cu * 256 * vec) + { + constexpr uint32_t wg = 256; + const int grid_x = (num_elements / vec + wg - 1) / wg; + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(wg, 1, 1); + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_element_kernel", [&] + { aiter::operator_element_kernel + <<>>(buf_a, buf_b, buf_c, num_elements, types_match); }); + } + else if (num_elements % (rows * vec * 256) == 0) + { + constexpr uint32_t wg = 256; + int grid_x = (num_elements / (rows * vec) + wg - 1) / wg; + int occupancy; + auto kernel_ptr = aiter::operator_contiguous_kernel_naive<_T0, rows, Operation, true, _T0, _T1>; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, reinterpret_cast(kernel_ptr), wg, 0); + grid_x = grid_x < num_cu * occupancy ? 
grid_x : num_cu * occupancy; + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(wg, 1, 1); + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_contiguous_kernel_naive", [&] + { aiter::operator_contiguous_kernel_naive + <<>>(buf_a, buf_b, buf_c, num_elements, types_match); }); + } + else if (N % rows == 0 && K % vec == 0) + { + constexpr uint32_t wg = 256; + const int grid_x = (num_elements / (rows * vec) + wg - 1) / wg; + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(wg, 1, 1); + + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_contiguous_kernel", [&] + { aiter::operator_contiguous_kernel + <<>>(buf_a, buf_b, buf_c, M, N, K, types_match); }); + } + else + { + constexpr uint32_t wg = 256; + constexpr uint32_t BIG_TILE_SIZE_N = 64; + constexpr uint32_t BIG_TILE_SIZE_K = 64; + constexpr uint32_t M_SWIZZLE = 8; + const int grid_x = M * ((N + BIG_TILE_SIZE_N - 1) / BIG_TILE_SIZE_N) * ((K + BIG_TILE_SIZE_K - 1) / BIG_TILE_SIZE_K); + const dim3 grid_dim(grid_x, 1, 1); + const dim3 block_dim(wg, 1, 1); + + VLLM_DISPATCH_FLOATING_TYPES( + output.scalar_type(), "operator_contiguous_big_tile_kernel", [&] + { aiter::operator_contiguous_big_tile_kernel + <<>>(buf_a, buf_b, buf_c, K, N, types_match); }); + } + } +}; + +template +void binary_operation_process(torch::Tensor &input, torch::Tensor &other, torch::Tensor &output, bool order_flag) +{ + BinaryOperationPattern::apply(input, other, output, order_flag); +} + +void binary_op_dispatch(const std::string& op_type, torch::Tensor &input, torch::Tensor &other, torch::Tensor &output); diff --git a/csrc/include/cache.h b/csrc/include/cache.h new file mode 100644 index 0000000000000000000000000000000000000000..0cb2f2ac5a0e0c304b950b7c8fad4222ed7ebbc5 --- /dev/null +++ b/csrc/include/cache.h @@ -0,0 +1,46 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +#include +#include + +void swap_blocks(torch::Tensor &src, torch::Tensor &dst, + const torch::Tensor 
&block_mapping); + +// Note: the key_caches and value_caches vectors are constant but +// not the Tensors they contain. The vectors need to be const refs +// in order to satisfy pytorch's C++ operator registration code. +void copy_blocks(std::vector const &key_caches, + std::vector const &value_caches, + const torch::Tensor &block_mapping); + +void reshape_and_cache(torch::Tensor &key, torch::Tensor &value, + torch::Tensor &key_cache, torch::Tensor &value_cache, + torch::Tensor &slot_mapping, + const std::string &kv_cache_dtype, const double k_scale, + const double v_scale, const bool asm_layout); + +void reshape_and_cache_flash(torch::Tensor &key, torch::Tensor &value, + torch::Tensor &key_cache, + torch::Tensor &value_cache, + torch::Tensor &slot_mapping, + const std::string &kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale); + +void reshape_and_cache_with_pertoken_quant(torch::Tensor &key, torch::Tensor &value, + torch::Tensor &key_cache, torch::Tensor &value_cache, + torch::Tensor &k_dequant_scales, torch::Tensor &v_dequant_scales, + torch::Tensor &slot_mapping, + const bool asm_layout); + +void reshape_and_cache_with_block_quant(torch::Tensor &key, torch::Tensor &value, + torch::Tensor &key_cache, torch::Tensor &value_cache, + torch::Tensor &k_dequant_scales, torch::Tensor &v_dequant_scales, + torch::Tensor &slot_mapping, + const bool asm_layout); + +// Just for unittest +void convert_fp8(torch::Tensor &dst_cache, torch::Tensor &src_cache, + const double scale, const std::string &kv_cache_dtype); \ No newline at end of file diff --git a/csrc/include/ck_tile/vec_convert.h b/csrc/include/ck_tile/vec_convert.h new file mode 100644 index 0000000000000000000000000000000000000000..9b61b1e3b7bb39e6ac8b99106765bcb2ec4d7a02 --- /dev/null +++ b/csrc/include/ck_tile/vec_convert.h @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: MIT + +#pragma once +#include "aiter_hip_common.h" + +namespace ck_tile { +template +using vec_t = thread_buffer; +// using vec_t = 
ext_vector_t; + +using int8x2_v = vec_t; +using fp8x2_v = vec_t; +using fp16x2_v = vec_t; +using bf16x2_v = vec_t; +using fp32x2_v = vec_t; +struct fp4x2_t +{ + using type = uint8_t; + type data; + __host__ __device__ constexpr fp4x2_t() : data{type{}} {} + __host__ __device__ constexpr fp4x2_t(type init) : data{init} {} +}; +using fp4x2x2_v = vec_t; +using fp4x2x4_v = vec_t; +using fp4x2x8_v = vec_t; + +template <> +struct vector_traits +{ + using scalar_type = uint8_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct numeric +{ + // maximum finite value + CK_TILE_HOST_DEVICE static constexpr fp32_t max() { return 6.0f; } +}; +CK_TILE_DEVICE fp32x2_v amd_assembly_pk_mul_f32(fp32x2_v a, fp32x2_t b) +{ + fp32x2_v c; +#if defined(__gfx938__) || defined(__gfx946__) || defined(__gfx936__) + asm volatile("v_pk_mul_f32 %0, %1, %2" : "=v"(c) : "v"(a), "v"(b)); +#else + asm volatile("v_mul_f32 %[v_result0], %[v_a0], %[v_b0]\n\t" + "v_mul_f32 %[v_result1], %[v_a1], %[v_b1]\n\t" + : [v_result0] "=v"(c[0]), [v_result1] "=v"(c[1]) + : [v_a0] "v"(a[0]), [v_a1] "v"(a[1]), [v_b0] "v"(b[0]), [v_b1] "v"(b[1])); + +#endif + return c; +} +#ifdef ENABLE_FP8 +#if defined(__gfx938__) +CK_TILE_DEVICE fp8x2_v amd_assembly_cvt_pk_fp8_f32(fp32_t a, fp32_t b) +{ + int16x2_t c; + asm volatile("v_cvt_pk_fp8_f32 %0, %1, %2, %3 op_sel:[0,0,0,0]" : "=v"(c) : "v"(a), "v"(b), "v"(c)); + return bit_cast(c[0]); +} +CK_TILE_DEVICE fp8x2_v amd_assembly_cvt_pk_bf8_f32(fp32_t a, fp32_t b) +{ + int16x2_t c; + asm volatile("v_cvt_pk_bf8_f32 %0, %1, %2, %3 op_sel:[0,0,0,0]" : "=v"(c) : "v"(a), "v"(b), "v"(c)); + return bit_cast(c[0]); +} +#endif +#endif + +#if defined(__gfx946__) +CK_TILE_DEVICE fp4x2_t amd_assembly_cvt_scalef32_pk_fp4_f32(fp32_t a, fp32_t b, fp32_t scale) +{ + int16x2_t c; + // permute high bits and low bits to match the order of the original vector + asm volatile("v_cvt_scalef32_pk_fp4_f32 %0, %1, %2, %3" : "=v"(c) : "v"(b), "v"(a), "v"(scale)); + return 
bit_cast(bit_cast(c[0])[0]); +} +CK_TILE_DEVICE fp4x2_t amd_assembly_cvt_scalef32_pk_fp4_f16(fp16x2_v a, fp32_t scale) +{ + int16x2_t c; + // permute high bits and low bits to match the order of the original vector + asm volatile("v_cvt_scalef32_pk_fp4_f16 %0, %1, %2" : "=v"(c) : "v"(a), "v"(scale)); + return bit_cast(bit_cast(c[0])[0]); +} +CK_TILE_DEVICE fp4x2_t amd_assembly_cvt_scalef32_pk_fp4_bf16(bf16x2_v a, fp32_t scale) +{ + int16x2_t c; + // permute high bits and low bits to match the order of the original vector + asm volatile("v_cvt_scalef32_pk_fp4_bf16 %0, %1, %2" : "=v"(c) : "v"(a), "v"(scale)); + return bit_cast(bit_cast(c[0])[0]); +} +#endif + +// convert any to fp32x?_t one by one +template ), bool> = false> +CK_TILE_HOST_DEVICE constexpr vec_t vec_convert(vec_t x) +{ + using fp32xX_t = vec_t; + fp32xX_t tmp; + for(size_t i = 0; i < N; i++) + { + tmp[i] = type_convert(x[i]); + } + return tmp; +} + +template = false, + std::enable_if_t<(!(std::is_same_v)), bool> = false> +CK_TILE_HOST_DEVICE constexpr vec_t vec_convert(vec_t x, fp32_t inverted_scale) +{ + if constexpr(!std::is_same_v) + { + using fp32xX_t = vec_t; + fp32xX_t tmp = vec_convert(x); + return vec_convert(tmp, inverted_scale); + } + else + { + // fp32->?? + return vec_convert(x, inverted_scale); + } +} + +// fp32x2 -> fp8x2 +#if defined(__gfx938__) || defined(__gfx946__) +CK_TILE_HOST_DEVICE constexpr fp8x2_v fp32x2_t_to_fp8x2_t(fp32x2_v x, fp32_t inverted_scale) +{ + using vec_ti = vector_traits; + constexpr int vec_size = vec_ti::vector_size; + constexpr auto interpret = numeric_traits::f8_interpret; + fp32x2_v tmp = amd_assembly_pk_mul_f32(x, fp32x2_t{inverted_scale, inverted_scale}); + + return (interpret == fp8_interpretation::E4M3_FNUZ) || + (interpret == fp8_interpretation::E4M3_OCP) + ? 
amd_assembly_cvt_pk_fp8_f32(tmp[0], tmp[1]) + : amd_assembly_cvt_pk_bf8_f32(tmp[0], tmp[1]); +} +#endif +// fp32x2 -> int8x2 +CK_TILE_HOST_DEVICE constexpr int8x2_v fp32x2_t_to_int8x2_t(fp32x2_v x, fp32_t inverted_scale) +{ + fp32x2_v tmp = amd_assembly_pk_mul_f32(x, fp32x2_t{inverted_scale, inverted_scale}); + + int8x2_v out; + out[0] = static_cast(tmp[0]); + out[1] = static_cast(tmp[1]); + return out; +} +#if defined(__gfx946__) +// fp32x2 -> fp4x2 +CK_TILE_HOST_DEVICE constexpr fp4x2_t fp32x2_t_to_fp4x2_t(fp32x2_v x, fp32_t inverted_scale) +{ + return amd_assembly_cvt_scalef32_pk_fp4_f32(x[0], x[1], inverted_scale); +} +// fp16x2 -> fp4x2 +CK_TILE_HOST_DEVICE constexpr fp4x2_t fp16x2_t_to_fp4x2_t(fp16x2_v x, fp32_t inverted_scale) +{ + return amd_assembly_cvt_scalef32_pk_fp4_f16(x, inverted_scale); +} +// bf16x2 -> fp4x2 +CK_TILE_HOST_DEVICE constexpr fp4x2_t bf16x2_t_to_fp4x2_t(bf16x2_v x, fp32_t inverted_scale) +{ + return amd_assembly_cvt_scalef32_pk_fp4_bf16(x, inverted_scale); +} +#endif + +#define CK_TILE_TYPE_CONVERT(dtype_, stype_, vec_size_) \ + template <> \ + CK_TILE_HOST_DEVICE constexpr vec_t \ + vec_convert(vec_t x, \ + fp32_t inverted_scale) \ + { \ + constexpr int iter_num = vec_size_ / 2; \ + vec_t out; \ + using vec_i2 = vec_t; \ + using vec_o2 = vec_t; \ + _Pragma("unroll") for(size_t i = 0; i < iter_num; i++) \ + { \ + vec_o2 tmp = stype_##x2##_t_to_##dtype_##x2##_t(x.template get_as()(i), \ + inverted_scale); \ + out.template get_as()(i) = tmp; \ + } \ + return out; \ + } +#if defined(__gfx938__) || defined(__gfx946__) +CK_TILE_TYPE_CONVERT(fp8, fp32, 2) +CK_TILE_TYPE_CONVERT(fp8, fp32, 4) +CK_TILE_TYPE_CONVERT(fp8, fp32, 8) +CK_TILE_TYPE_CONVERT(fp8, fp32, 16) +CK_TILE_TYPE_CONVERT(fp8, fp32, 32) +#endif +CK_TILE_TYPE_CONVERT(int8, fp32, 2) +CK_TILE_TYPE_CONVERT(int8, fp32, 4) +CK_TILE_TYPE_CONVERT(int8, fp32, 8) +CK_TILE_TYPE_CONVERT(int8, fp32, 16) +CK_TILE_TYPE_CONVERT(int8, fp32, 32) +#undef CK_TILE_TYPE_CONVERT + +// 4 bit vec convert 
+// convert any to fp32x?_t one by one +#if defined(__gfx946__) +template = false, + std::enable_if_t<((std::is_same_v)), bool> = false> +CK_TILE_HOST_DEVICE constexpr vec_t vec_convert(vec_t x, fp32_t inverted_scale); + +#define CK_TILE_TYPE_CONVERT(dtype_, stype_, vec_size_) \ + template <> \ + CK_TILE_HOST_DEVICE constexpr vec_t \ + vec_convert(vec_t x, \ + fp32_t inverted_scale) \ + { \ + constexpr int iter_num = vec_size_ / 2; \ + vec_t out; \ + using vec_i2 = vec_t; \ + using vec_o2 = dtype_##_t; \ + _Pragma("unroll") for(size_t i = 0; i < iter_num; i++) \ + { \ + vec_o2 tmp = \ + stype_##x2##_t_to_##dtype_##_t(x.template get_as()(i), inverted_scale); \ + out.template get_as()(i) = tmp; \ + } \ + return out; \ + } + +CK_TILE_TYPE_CONVERT(fp4x2, fp32, 4) +CK_TILE_TYPE_CONVERT(fp4x2, fp32, 8) +CK_TILE_TYPE_CONVERT(fp4x2, fp32, 16) +CK_TILE_TYPE_CONVERT(fp4x2, fp32, 32) + +CK_TILE_TYPE_CONVERT(fp4x2, fp16, 4) +CK_TILE_TYPE_CONVERT(fp4x2, fp16, 8) +CK_TILE_TYPE_CONVERT(fp4x2, fp16, 16) +CK_TILE_TYPE_CONVERT(fp4x2, fp16, 32) + +CK_TILE_TYPE_CONVERT(fp4x2, bf16, 4) +CK_TILE_TYPE_CONVERT(fp4x2, bf16, 8) +CK_TILE_TYPE_CONVERT(fp4x2, bf16, 16) +CK_TILE_TYPE_CONVERT(fp4x2, bf16, 32) +#endif +#undef CK_TILE_TYPE_CONVERT + +} // namespace ck_tile diff --git a/csrc/include/communication_asm.h b/csrc/include/communication_asm.h new file mode 100644 index 0000000000000000000000000000000000000000..490049dd56f7d1ed489575907ccdef98d553d67d --- /dev/null +++ b/csrc/include/communication_asm.h @@ -0,0 +1,27 @@ +#pragma once +// SPDX-License-Identifier: MIT + +torch::Tensor all_reduce_asm(torch::Tensor &input, + int64_t _ca, + torch::Tensor &reg_sig, torch::Tensor &reg_buffer, bool isGraph); + +std::tuple // out, residual_out +all_reduce_rmsnorm(torch::Tensor &input, // [m ,n] + torch::Tensor &residual_in, // [m ,n] + torch::Tensor &weight, // [1 ,n] + torch::Tensor &bias, // [1 ,n] + float epsilon, + // following are fused_allreduce args + int64_t _ca, + torch::Tensor &reg_sig, 
torch::Tensor &reg_buffer, bool isGraph); + +std::tuple // out, residual_out, yscale +all_reduce_rmsnorm_quant(torch::Tensor &input, // [m ,n] + torch::Tensor &residual_in, // [m ,n] + torch::Tensor &xscale, // [1 ,n] + torch::Tensor &weight, // [1 ,n] + torch::Tensor &bias, // [1 ,n] + float epsilon, + // following are fused_allreduce args + int64_t _ca, + torch::Tensor &reg_sig, torch::Tensor &reg_buffer, bool isGraph); \ No newline at end of file diff --git a/csrc/include/custom.h b/csrc/include/custom.h new file mode 100644 index 0000000000000000000000000000000000000000..f7bbed0809d0472c05addecc151b5349d7f99c29 --- /dev/null +++ b/csrc/include/custom.h @@ -0,0 +1,11 @@ +#pragma once +// SPDX-License-Identifier: MIT + +#include + +void wvSpltK(at::Tensor &in_a, at::Tensor &in_b, at::Tensor &out_c, + const int64_t N_in, const int64_t CuCount); + +void LLMM1( + at::Tensor &in_a, at::Tensor &in_b, at::Tensor &out_c, + const int64_t rows_per_block); diff --git a/csrc/include/custom_all_reduce.cuh b/csrc/include/custom_all_reduce.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ec4de7b6b964f7beb55435165dea87133e644764 --- /dev/null +++ b/csrc/include/custom_all_reduce.cuh @@ -0,0 +1,1857 @@ +#pragma once +/* + * Copyright (C) 2024-2025, The vLLM team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "aiter_hip_common.h" +#include "ck_tile/core.hpp" +#include "communication_asm.h" +#include "hip_float8.h" +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace aiter +{ + + constexpr int kMaxBlocks = 80; + // note: we don't want to use atomics for signals because peer atomics are not + // supported on PCIe links + struct Signal + { + alignas(128) uint32_t start[kMaxBlocks][8]; + alignas(128) uint32_t end[kMaxBlocks][8]; + alignas(128) uint32_t _flag[kMaxBlocks]; // incremental flags for each rank + }; + +#ifdef USE_ROCM + struct __align__(16) RankData { const void *ptrs[8]; }; +#else + struct __align__(16) RankData { const void *__restrict__ ptrs[8]; }; +#endif + + struct __align__(16) RankSignals + { +#ifndef USE_ROCM + volatile +#endif + Signal *signals[8]; + }; + + // like std::array, but aligned + template + struct __align__(alignof(T) * sz) array_t + { + T data[sz]; + using type = T; + static constexpr int size = sz; + }; + + // use packed type to maximize memory efficiency + // goal: generate ld.128 and st.128 instructions + template + struct packed_t + { + // the (P)acked type for load/store + using P = array_t; + // the (A)ccumulator type for reduction + using A = array_t; + }; + +#define DINLINE __device__ __forceinline__ + + // scalar cast functions + DINLINE float upcast_s(half val) { return __half2float(val); } + + template + DINLINE T downcast_s(float val); + template <> + DINLINE half downcast_s(float val) + { + return __float2half(val); + } + + // scalar add functions + // for some reason when compiling with Pytorch, the + operator for half and + // bfloat is disabled so we call the intrinsics directly + DINLINE half &assign_add(half &a, half b) + { + a = __hadd(a, b); + return a; + } + DINLINE float &assign_add(float &a, float b) { return a += b; } + +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + DINLINE float upcast_s(__hip_bfloat16 val) { return __bfloat162float(val); } + template
<> + DINLINE __hip_bfloat16 downcast_s(float val) + { + return __float2bfloat16(val); + } + DINLINE __hip_bfloat16 &assign_add(__hip_bfloat16 &a, __hip_bfloat16 b) + { + a = __hadd(a, b); + return a; + } +#endif + + template + DINLINE array_t &packed_assign_add(array_t &a, array_t b) + { +#pragma unroll + for (int i = 0; i < N; i++) + { + assign_add(a.data[i], b.data[i]); + } + return a; + } + + template + DINLINE array_t upcast(array_t val) + { + if constexpr (std::is_same::value) + { + return val; + } + else + { + array_t out; +#pragma unroll + for (int i = 0; i < N; i++) + { + out.data[i] = upcast_s(val.data[i]); + } + return out; + } + } + + template + DINLINE O downcast(array_t val) + { + if constexpr (std::is_same::value) + { + return val; + } + // else if constexpr (std::is_same::value) + // { + // O out; + // #pragma unroll + // for (int i = 0; i < O::size; i++) + // { + // union fcvt { + // uint32_t i32; + // float f32; + // } u; + // u.f32 = val.data[i]; + // out.data[i] = __builtin_bit_cast(__hip_bfloat16, uint16_t(u.i32 >> 16)); + // } + // return out; + // } + else + { + O out; +#pragma unroll + for (int i = 0; i < O::size; i++) + { + out.data[i] = downcast_s(val.data[i]); + } + return out; + } + } + + // This function is meant to be used as the first synchronization in the all + // reduce kernel. Thus, it doesn't need to make any visibility guarantees for + // prior memory accesses. Note: volatile writes will not be reordered against + // other volatile writes. + template + DINLINE void start_sync(const RankSignals &sg, +#ifndef USE_ROCM + volatile +#endif + Signal *self_sg, + int rank) + { +#ifdef USE_ROCM + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) + { + // simultaneously write to the corresponding flag of all ranks. 
+ // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank], + flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x], + __ATOMIC_RELAXED, + __MEMORY_SCOPE_DEVICE) < flag) + ; + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) + self_sg->_flag[blockIdx.x] = flag; +#else + if (threadIdx.x < ngpus) + { + // reset flag for next time + self_sg->end[blockIdx.x][threadIdx.x] = 0; + // simultaneously write to the corresponding flag of all ranks. + // Latency = 1 p2p write + sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1; + // wait until we got true from all ranks + while (!self_sg->start[blockIdx.x][threadIdx.x]) + ; + } + __syncthreads(); +#endif + } + + // This function is meant to be used as the second or the final synchronization + // barrier in the all reduce kernel. If it's the final synchronization barrier, + // we don't need to make any visibility guarantees for prior memory accesses. + template + DINLINE void end_sync(const RankSignals &sg, +#ifndef USE_ROCM + volatile +#endif + Signal *self_sg, + int rank) + { +#ifdef USE_ROCM + __syncthreads(); + // eliminate the case that prior writes are not visible after signals become + // visible. Note that I did not manage to make this happen through a lot of + // testing. Might be the case that hardware provides stronger guarantee than + // the memory model. + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) + { + // simultaneously write to the corresponding flag of all ranks. + // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], + flag, + final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE, + __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while ( + __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x], + final_sync ?
__ATOMIC_RELAXED : __ATOMIC_ACQUIRE, + __MEMORY_SCOPE_DEVICE) < flag) + ; + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) + self_sg->_flag[blockIdx.x] = flag; +#else + __syncthreads(); + // eliminate the case that prior writes are not visible after signals become + // visible. Note that I did not manage to make this happen through a lot of + // testing. Might be the case that hardware provides stronger guarantee than + // the memory model. + if constexpr (!final_sync) + __threadfence_system(); + if (threadIdx.x < ngpus) + { + // reset flag for next time + self_sg->start[blockIdx.x][threadIdx.x] = 0; + // simultaneously write to the corresponding flag of all ranks. + // Latency = 1 p2p write + sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1; + // wait until we got true from all ranks + while (!self_sg->end[blockIdx.x][threadIdx.x]) + ; + } + if constexpr (!final_sync) + __syncthreads(); +#endif + } + + template + DINLINE P packed_reduce(const P *ptrs[], int idx) + { + A tmp = upcast(ptrs[0][idx]); +#pragma unroll + for (int i = 1; i < ngpus; i++) + { + packed_assign_add(tmp, upcast(ptrs[i][idx])); + } + return downcast

(tmp); + } + + template + __global__ void __launch_bounds__(512, 1) + cross_device_reduce_1stage_naive(RankData *_dp, RankSignals sg, +#ifndef USE_ROCM + volatile +#endif + Signal *self_sg, + T *__restrict__ result, int rank, int size) + { + using P = typename packed_t::P; + using A = typename packed_t::A; + // note: we don't reorder the address so the accumulation order is the same + // for all ranks, ensuring bitwise identical results + auto dp = *_dp; + start_sync(sg, self_sg, rank); + // do the actual reduction + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) + { + ((P *)result)[idx] = packed_reduce((const P **)&dp.ptrs[0], idx); + } + end_sync(sg, self_sg, rank); + // // Step-2 consumes data written by peers in step-1, so we need + // // visibility guarantees from this barrier. + // end_sync(sg, self_sg, rank); + } + + template +#ifdef USE_ROCM + DINLINE P *get_tmp_buf(Signal *sg) + { +#else + DINLINE P *get_tmp_buf(volatile Signal *sg) + { +#endif + return (P *)(((Signal *)sg) + 1); + } + + template + __global__ void __launch_bounds__(512, 1) + cross_device_reduce_2stage_naive(RankData *_dp, RankSignals sg, +#ifndef USE_ROCM + volatile +#endif + Signal *self_sg, + T *__restrict__ result, int rank, int size) + { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + using P = typename packed_t::P; + using A = typename packed_t::A; + int part = size / ngpus; + int start = rank * part; + int end = rank == ngpus - 1 ? size : start + part; + int largest_part = part + size % ngpus; + const P *ptrs[ngpus]; + P *tmps[ngpus]; +#pragma unroll + for (int i = 0; i < ngpus; i++) + { + int target = (rank + i) % ngpus; + ptrs[i] = (const P *)_dp->ptrs[target]; + tmps[i] = get_tmp_buf

(sg.signals[target]); + } + auto tmp_out = tmps[0]; + start_sync(sg, self_sg, rank); + // stage 1: reduce scatter + for (int idx = start + tid; idx < end; idx += stride) + { + tmp_out[idx - start] = packed_reduce(ptrs, idx); + } + end_sync(sg, self_sg, rank); + + // stage 2: allgather. Note: it's important to match the tid between + // the two stages, because visibility across devices is only guaranteed + // between threads that have the same tid. If thread i computes the sum of + // start + i in the first stage, then thread i also gathers start + i from all + // ranks. + for (int idx = tid; idx < largest_part; idx += stride) + { +#pragma unroll + for (int i = 0; i < ngpus; i++) + { + int gather_from_rank = ((rank + i) % ngpus); + if (gather_from_rank == ngpus - 1 || idx < part) + { + int dst_idx = gather_from_rank * part + idx; + ((P *)result)[dst_idx] = tmps[i][idx]; + } + } + } + } + +#define THREAD_NUM 512 + +// Toggle whether fused allreduce+rmsnorm keeps per-element rms input in float +// before the final cast to output dtype. 
+#ifndef AITER_FUSED_AR_RMS_KEEP_RMS_INP_F32 +#define AITER_FUSED_AR_RMS_KEEP_RMS_INP_F32 1 +#endif + + template + __global__ void __launch_bounds__(512, 1) + cross_device_reduce_1stage(RankData *_dp, RankSignals sg, +#ifndef USE_ROCM + volatile +#endif + Signal *self_sg, + T *__restrict__ result, int rank, int size) + { + using P = typename packed_t::P; + using A = typename packed_t::A; + constexpr int pack_size = packed_t::P::size; + constexpr int tnum_gpu = THREAD_NUM / ngpus; + __shared__ T tmp_smem[tnum_gpu * ngpus * pack_size]; + // note: we don't reorder the address so the accumulation order is the same + // for all ranks, ensuring bitwise identical results + auto dp = *_dp; + + // load one gpu data each wave + int warp_id = threadIdx.x / tnum_gpu; + int lane_id = threadIdx.x % tnum_gpu; + start_sync(sg, self_sg, rank); + // do the actual reduction + for (int idx = blockIdx.x * tnum_gpu + lane_id; idx < size; + idx += gridDim.x * tnum_gpu) + { + *(reinterpret_cast(&tmp_smem[0]) + threadIdx.x) = ((const P**)&dp.ptrs[0])[warp_id][idx]; + __syncthreads(); + if (warp_id == 0) + { + A add_reg; +#pragma unroll + for (int i = 0; i < pack_size; ++i) + { + add_reg.data[i] = ck_tile::type_convert(tmp_smem[threadIdx.x * pack_size + i]); + } + constexpr int smem_gpu_loop_stride = tnum_gpu * pack_size; +#pragma unroll + for (int i = 1; i < ngpus; ++i) + { +#pragma unroll + for (int j = 0; j < pack_size; ++j) + { + add_reg.data[j] += ck_tile::type_convert(tmp_smem[smem_gpu_loop_stride * i + threadIdx.x * pack_size + j]); + } + } + P write_reg; +#pragma unroll + for (int i = 0; i < pack_size; ++i) + { + write_reg.data[i] = ck_tile::type_convert(add_reg.data[i]); + } + ((P *)result)[idx] = write_reg; + } + __syncthreads(); + } + // maybe do not need device sync + // end_sync(sg, self_sg, rank); + } + + template + __global__ void __launch_bounds__(512, 1) + cross_device_reduce_2stage(RankData *_dp, RankSignals sg, +#ifndef USE_ROCM + volatile +#endif + Signal *self_sg, + T 
*__restrict__ result, int rank, int size) + { + constexpr int pack_size = packed_t::P::size; + constexpr int tnum_gpu = THREAD_NUM / ngpus; + using P = typename packed_t::P; + using A = typename packed_t::A; + __shared__ T tmp_smem[tnum_gpu * ngpus * pack_size]; + int warp_id = threadIdx.x / tnum_gpu; + int lane_id = threadIdx.x % tnum_gpu; + int tid = blockIdx.x * tnum_gpu + lane_id; + int stride = gridDim.x * tnum_gpu; + int part = size / ngpus; + int start = rank * part; + int end = rank == ngpus - 1 ? size : start + part; + int largest_part = part + size % ngpus; + const P *ptrs[ngpus]; + P *tmps[ngpus]; +#pragma unroll + for (int i = 0; i < ngpus; i++) + { + int target = (rank + i) % ngpus; + ptrs[i] = (const P *)_dp->ptrs[target]; + tmps[i] = get_tmp_buf

(sg.signals[target]); + } + auto tmp_out = tmps[0]; + start_sync(sg, self_sg, rank); + // stage 1: reduce scatter + for (int idx = start + tid; idx < end; idx += stride) + { + *(reinterpret_cast(&tmp_smem[0]) + threadIdx.x) = ptrs[warp_id][idx]; + __syncthreads(); + // cal add in first 64 threads + if (warp_id == 0) + { + A add_reg; +#pragma unroll + for (int i = 0; i < pack_size; ++i) + { + add_reg.data[i] = ck_tile::type_convert(tmp_smem[pack_size * threadIdx.x + i]); + } + constexpr int smem_gpu_loop_stride = tnum_gpu * pack_size; +#pragma unroll + for (int i = 1; i < ngpus; ++i) + { +#pragma unroll + for (int j = 0; j < pack_size; ++j) + { + add_reg.data[j] += ck_tile::type_convert(tmp_smem[i * smem_gpu_loop_stride + pack_size * threadIdx.x + j]); + } + } + P write_reg; +#pragma unroll + for (int i = 0; i < pack_size; ++i) + { + write_reg.data[i] = ck_tile::type_convert(add_reg.data[i]); + } + tmp_out[idx - start] = write_reg; + } + __syncthreads(); + } + end_sync(sg, self_sg, rank); + + // stage 2: allgather. Note: it's important to match the tid between + // the two stages, because visibility across devices is only guaranteed + // between threads that have the same tid. If thread i computes the sum of + // start + i in the first stage, then thread i also gathers start + i from all + // ranks. 
+ for (int idx = tid; idx < largest_part; idx += stride) + { + int dst_idx = (warp_id + rank) % ngpus * part + idx; + ((P *)result)[dst_idx] = tmps[warp_id][idx]; + } + } + + /* + * naive allgather + * for case: input(1345,) + * */ + template + __global__ void __launch_bounds__(512, 1) allgather_naive( + RankData* _dp, + RankSignals sg, + Signal* self_sg, + T* __restrict__ result, + int rank, + int size + ) + { + constexpr int tnum_gpu = THREAD_NUM / ngpus; + int warp_id = threadIdx.x / tnum_gpu; + int lane_id = threadIdx.x % tnum_gpu; + int tid = blockIdx.x * tnum_gpu + lane_id; + int stride = gridDim.x * tnum_gpu; + const T* ptrs[ngpus]; + +#pragma unroll + for (int i = 0; i < ngpus; ++i) + { + ptrs[i] = (const T*)_dp->ptrs[i]; + } + start_sync(sg, self_sg, rank); + + for (int idx = tid; idx < size; idx += stride) + { + int write_idx = warp_id * size + idx; + result[write_idx] = ptrs[warp_id][idx]; + } + } + + template + __global__ void __launch_bounds__(512, 1) allgather_vec( + RankData* _dp, + RankSignals sg, + Signal* self_sg, + T* __restrict__ result, + int rank, + int size + ) + { + constexpr int tnum_gpu = THREAD_NUM / ngpus; + using P = typename packed_t::P; + int warp_id = threadIdx.x / tnum_gpu; + int lane_id = threadIdx.x % tnum_gpu; + int tid = blockIdx.x * tnum_gpu + lane_id; + int stride = gridDim.x * tnum_gpu; + const P* ptrs[ngpus]; + +#pragma unroll + for (int i = 0; i < ngpus; ++i) + { + ptrs[i] = (const P*)_dp->ptrs[i]; + } + start_sync(sg, self_sg, rank); + + for (int idx = tid; idx < size; idx += stride) + { + int write_idx = warp_id * size + idx; + *(reinterpret_cast(&result[0]) + write_idx) = ptrs[warp_id][idx]; + } + } + + // fp8 quant all-reduce code start + template + struct Fp16Filter + { + static const bool value = false; + }; + + template <> + struct Fp16Filter + { + static const bool value = true; + }; + + template + struct Bf16Filter + { + static const bool value = false; + }; + + template <> + struct Bf16Filter<__hip_bfloat16> + { + 
static const bool value = true; + }; + + // dtypes only support half and bf16 now +#define FP16_FILTER \ + typename std::enable_if::value, void>::type* = nullptr + +#define BF16_FILTER \ + typename std::enable_if::value, void>::type* = nullptr + + template